-
Notifications
You must be signed in to change notification settings - Fork 14.5k
[LoopIdiom] Select llvm.experimental.memset.pattern intrinsic rather than memset_pattern16 libcall #126736
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
[LoopIdiom] Select llvm.experimental.memset.pattern intrinsic rather than memset_pattern16 libcall #126736
Changes from 16 commits
a052059
6409a4b
59ecf00
809320f
b561986
808e5c2
756d7a4
763b40a
48a9ed8
71e0307
07c8df2
3e77fd8
e693285
77ba4b4
fbed3bc
07af8f7
b70871c
23dcffe
13ae5b8
e6bb52c
c15584d
c178f9f
7a5495d
d8b4ae1
43a05fd
c6714f2
cdd5029
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -149,6 +149,11 @@ static cl::opt<bool> UseLIRCodeSizeHeurs( | |
"with -Os/-Oz"), | ||
cl::init(true), cl::Hidden); | ||
|
||
static cl::opt<bool> ForceMemsetPatternIntrinsic( | ||
"loop-idiom-force-memset-pattern-intrinsic", | ||
cl::desc("Use memset.pattern intrinsic whenever possible"), cl::init(false), | ||
cl::Hidden); | ||
|
||
namespace { | ||
|
||
class LoopIdiomRecognize { | ||
|
@@ -322,10 +327,15 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) { | |
L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs; | ||
|
||
HasMemset = TLI->has(LibFunc_memset); | ||
// TODO: Unconditionally enable use of the memset pattern intrinsic (or at | ||
// least, opt-in via target hook) once we are confident it will never result | ||
// in worse codegen than without. For now, use it only when the target | ||
// supports memset_pattern16 libcall (or unless this is overridden by | ||
// command line option). | ||
HasMemsetPattern = TLI->has(LibFunc_memset_pattern16); | ||
HasMemcpy = TLI->has(LibFunc_memcpy); | ||
|
||
if (HasMemset || HasMemsetPattern || HasMemcpy) | ||
if (HasMemset || HasMemsetPattern || ForceMemsetPatternIntrinsic || HasMemcpy) | ||
if (SE->hasLoopInvariantBackedgeTakenCount(L)) | ||
return runOnCountableLoop(); | ||
|
||
|
@@ -411,14 +421,12 @@ static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) { | |
if (Size > 16) | ||
return nullptr; | ||
|
||
// If the constant is exactly 16 bytes, just use it. | ||
if (Size == 16) | ||
return C; | ||
// For now, don't handle types that aren't int, floats, or pointers. | ||
if (!isa<ConstantInt>(C) && !isa<ConstantFP>(C) && | ||
asb marked this conversation as resolved.
Show resolved
Hide resolved
|
||
!isa<PointerType>(C->getType())) | ||
return nullptr; | ||
|
||
// Otherwise, we'll use an array of the constants. | ||
unsigned ArraySize = 16 / Size; | ||
ArrayType *AT = ArrayType::get(V->getType(), ArraySize); | ||
return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C)); | ||
return C; | ||
} | ||
|
||
LoopIdiomRecognize::LegalStoreKind | ||
|
@@ -482,8 +490,9 @@ LoopIdiomRecognize::isLegalStore(StoreInst *SI) { | |
// It looks like we can use SplatValue. | ||
return LegalStoreKind::Memset; | ||
} | ||
if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset && | ||
// Don't create memset_pattern16s with address spaces. | ||
if (!UnorderedAtomic && (HasMemsetPattern || ForceMemsetPatternIntrinsic) && | ||
!DisableLIRP::Memset && | ||
// Don't create memset.pattern intrinsic calls with address spaces. | ||
StorePtr->getType()->getPointerAddressSpace() == 0 && | ||
getMemSetPatternValue(StoredVal, DL)) { | ||
// It looks like we can use PatternValue! | ||
|
@@ -1083,53 +1092,92 @@ bool LoopIdiomRecognize::processLoopStridedStore( | |
return Changed; | ||
|
||
// Okay, everything looks good, insert the memset. | ||
// MemsetArg is the number of bytes for the memset libcall, and the number | ||
// of pattern repetitions if the memset.pattern intrinsic is being used. | ||
Value *MemsetArg; | ||
std::optional<int64_t> BytesWritten; | ||
|
||
if (PatternValue && (HasMemsetPattern || ForceMemsetPatternIntrinsic)) { | ||
const SCEV *TripCountS = | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There's a bunch of complexity in this section of the change that I'm not getting. Since we need the NumBytes (in the original code) for the AAInfo, why not just leave that alone? Isn't count (for memset.pattern intrinsic) just the trip count of the loop by definition? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I still think this might be reworked, but we're down to little enough code here I'm okay with that being a followup. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the problem is that there may be multiple stores in one loop iteration? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, it's the case of multiple stores in a loop iteration. I could perhaps build the necessary value with numbytes and a div, but I'm wary of cases where it may not be folded away. |
||
SE->getTripCountFromExitCount(BECount, IntIdxTy, CurLoop); | ||
if (!Expander.isSafeToExpand(TripCountS)) | ||
return Changed; | ||
const SCEVConstant *ConstStoreSize = dyn_cast<SCEVConstant>(StoreSizeSCEV); | ||
if (!ConstStoreSize) | ||
return Changed; | ||
Value *TripCount = Expander.expandCodeFor(TripCountS, IntIdxTy, | ||
Preheader->getTerminator()); | ||
uint64_t PatternRepsPerTrip = | ||
(ConstStoreSize->getValue()->getZExtValue() * 8) / | ||
DL->getTypeSizeInBits(PatternValue->getType()); | ||
// If ConstStoreSize is not equal to the width of PatternValue, then | ||
// MemsetArg is TripCount * (ConstStoreSize/PatternValueWidth). Else | ||
// MemSetArg is just TripCount. | ||
MemsetArg = | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Isn't this just a weird way of computing NumBytes? |
||
PatternRepsPerTrip == 1 | ||
? TripCount | ||
: Builder.CreateMul(TripCount, | ||
Builder.getIntN(IntIdxTy->getIntegerBitWidth(), | ||
PatternRepsPerTrip)); | ||
if (auto *CI = dyn_cast<ConstantInt>(TripCount)) | ||
BytesWritten = | ||
CI->getZExtValue() * ConstStoreSize->getValue()->getZExtValue(); | ||
} else { | ||
const SCEV *NumBytesS = | ||
getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE); | ||
|
||
const SCEV *NumBytesS = | ||
getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE); | ||
|
||
// TODO: ideally we should still be able to generate memset if SCEV expander | ||
// is taught to generate the dependencies at the latest point. | ||
if (!Expander.isSafeToExpand(NumBytesS)) | ||
return Changed; | ||
|
||
Value *NumBytes = | ||
Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator()); | ||
// TODO: ideally we should still be able to generate memset if SCEV expander | ||
// is taught to generate the dependencies at the latest point. | ||
if (!Expander.isSafeToExpand(NumBytesS)) | ||
return Changed; | ||
MemsetArg = | ||
Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator()); | ||
if (auto *CI = dyn_cast<ConstantInt>(MemsetArg)) | ||
BytesWritten = CI->getZExtValue(); | ||
} | ||
assert(MemsetArg && "MemsetArg should have been set"); | ||
|
||
if (!SplatValue && !isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)) | ||
if (!SplatValue && !(ForceMemsetPatternIntrinsic || | ||
isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16))) | ||
return Changed; | ||
|
||
AAMDNodes AATags = TheStore->getAAMetadata(); | ||
for (Instruction *Store : Stores) | ||
AATags = AATags.merge(Store->getAAMetadata()); | ||
if (auto CI = dyn_cast<ConstantInt>(NumBytes)) | ||
AATags = AATags.extendTo(CI->getZExtValue()); | ||
if (BytesWritten) | ||
AATags = AATags.extendTo(BytesWritten.value()); | ||
else | ||
AATags = AATags.extendTo(-1); | ||
|
||
CallInst *NewCall; | ||
if (SplatValue) { | ||
NewCall = Builder.CreateMemSet( | ||
BasePtr, SplatValue, NumBytes, MaybeAlign(StoreAlignment), | ||
BasePtr, SplatValue, MemsetArg, MaybeAlign(StoreAlignment), | ||
/*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias); | ||
} else { | ||
assert (isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)); | ||
// Everything is emitted in default address space | ||
Type *Int8PtrTy = DestInt8PtrTy; | ||
|
||
StringRef FuncName = "memset_pattern16"; | ||
FunctionCallee MSP = getOrInsertLibFunc(M, *TLI, LibFunc_memset_pattern16, | ||
Builder.getVoidTy(), Int8PtrTy, Int8PtrTy, IntIdxTy); | ||
inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI); | ||
|
||
// Otherwise we should form a memset_pattern16. PatternValue is known to be | ||
// an constant array of 16-bytes. Plop the value into a mergable global. | ||
GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true, | ||
GlobalValue::PrivateLinkage, | ||
PatternValue, ".memset_pattern"); | ||
GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these. | ||
GV->setAlignment(Align(16)); | ||
Value *PatternPtr = GV; | ||
NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes}); | ||
assert(ForceMemsetPatternIntrinsic || | ||
isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)); | ||
assert(isa<SCEVConstant>(StoreSizeSCEV) && "Expected constant store size"); | ||
|
||
Value *PatternArg; | ||
IntegerType *PatternArgTy = | ||
Builder.getIntNTy(DL->getTypeSizeInBits(PatternValue->getType())); | ||
|
||
if (isa<ConstantInt>(PatternValue)) | ||
PatternArg = PatternValue; | ||
else if (isa<ConstantFP>(PatternValue)) | ||
PatternArg = Builder.CreateBitCast(PatternValue, PatternArgTy); | ||
else if (isa<PointerType>(PatternValue->getType())) | ||
PatternArg = Builder.CreatePtrToInt(PatternValue, PatternArgTy); | ||
else | ||
report_fatal_error("Unexpected PatternValue type"); | ||
|
||
NewCall = Builder.CreateIntrinsic(Intrinsic::experimental_memset_pattern, | ||
{DestInt8PtrTy, PatternArgTy, IntIdxTy}, | ||
{BasePtr, PatternArg, MemsetArg, | ||
ConstantInt::getFalse(M->getContext())}); | ||
if (StoreAlignment) | ||
cast<MemSetPatternInst>(NewCall)->setDestAlignment(*StoreAlignment); | ||
|
||
// Set the TBAA info if present. | ||
if (AATags.TBAA) | ||
|
@@ -1419,7 +1467,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad( | |
AAMDNodes AATags = TheLoad->getAAMetadata(); | ||
AAMDNodes StoreAATags = TheStore->getAAMetadata(); | ||
AATags = AATags.merge(StoreAATags); | ||
if (auto CI = dyn_cast<ConstantInt>(NumBytes)) | ||
if (auto *CI = dyn_cast<ConstantInt>(NumBytes)) | ||
asb marked this conversation as resolved.
Show resolved
Hide resolved
|
||
AATags = AATags.extendTo(CI->getZExtValue()); | ||
else | ||
AATags = AATags.extendTo(-1); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals | ||
; RUN: opt -passes=loop-idiom -mtriple=riscv64 < %s -S | FileCheck %s | ||
; RUN: opt -passes=loop-idiom -mtriple=riscv64 -loop-idiom-force-memset-pattern-intrinsic < %s -S \ | ||
; RUN: | FileCheck -check-prefix=CHECK-INTRIN %s | ||
|
||
define dso_local void @double_memset(ptr nocapture %p) { | ||
; CHECK-LABEL: @double_memset( | ||
; CHECK-NEXT: entry: | ||
; CHECK-NEXT: br label [[FOR_BODY:%.*]] | ||
; CHECK: for.cond.cleanup: | ||
; CHECK-NEXT: ret void | ||
; CHECK: for.body: | ||
; CHECK-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] | ||
; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds double, ptr [[P:%.*]], i64 [[I_07]] | ||
; CHECK-NEXT: store double 3.141590e+00, ptr [[PTR1]], align 1 | ||
; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_07]], 1 | ||
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 16 | ||
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] | ||
; | ||
; CHECK-INTRIN-LABEL: @double_memset( | ||
; CHECK-INTRIN-NEXT: entry: | ||
; CHECK-INTRIN-NEXT: call void @llvm.experimental.memset.pattern.p0.i64.i64(ptr align 1 [[P:%.*]], i64 4614256650576692846, i64 16, i1 false) | ||
; CHECK-INTRIN-NEXT: br label [[FOR_BODY:%.*]] | ||
; CHECK-INTRIN: for.cond.cleanup: | ||
; CHECK-INTRIN-NEXT: ret void | ||
; CHECK-INTRIN: for.body: | ||
; CHECK-INTRIN-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] | ||
; CHECK-INTRIN-NEXT: [[PTR1:%.*]] = getelementptr inbounds double, ptr [[P]], i64 [[I_07]] | ||
; CHECK-INTRIN-NEXT: [[INC]] = add nuw nsw i64 [[I_07]], 1 | ||
; CHECK-INTRIN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 16 | ||
; CHECK-INTRIN-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] | ||
; | ||
entry: | ||
br label %for.body | ||
|
||
for.cond.cleanup: | ||
ret void | ||
|
||
for.body: | ||
%i.07 = phi i64 [ %inc, %for.body ], [ 0, %entry ] | ||
%ptr1 = getelementptr inbounds double, ptr %p, i64 %i.07 | ||
store double 3.14159e+00, ptr %ptr1, align 1 | ||
%inc = add nuw nsw i64 %i.07, 1 | ||
%exitcond.not = icmp eq i64 %inc, 16 | ||
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body | ||
} | ||
;. | ||
; CHECK-INTRIN: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } | ||
;. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would mildly suggest dropping this cl::opt from the final patch. We probably should have a generic means to force enable a TLI function, but having one only for memset.pattern seems slightly odd. Not a strongly held view, will leave to your discretion.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I do have a patch somewhere that adds an opt flag to do this
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
#145808
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think force enabling of a TLI function is slightly orthogonal. This patch aims to introduce memset.pattern in just those cases where memset_pattern16 would have been available, even though we do have codegen support for targets without memset_pattern16. The rationale is that using memset.pattern on all targets in this way would be a big codegen change. So the flag means it's easy to test the more eager selection of the intrinsic.