-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[LoopIdiom] Select llvm.experimental.memset.pattern intrinsic rather than memset_pattern16 libcall #126736
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
[LoopIdiom] Select llvm.experimental.memset.pattern intrinsic rather than memset_pattern16 libcall #126736
Changes from 9 commits
a052059
6409a4b
59ecf00
809320f
b561986
808e5c2
756d7a4
763b40a
48a9ed8
71e0307
07c8df2
3e77fd8
e693285
77ba4b4
fbed3bc
07af8f7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -132,6 +132,11 @@ static cl::opt<bool> UseLIRCodeSizeHeurs( | |
"with -Os/-Oz"), | ||
cl::init(true), cl::Hidden); | ||
|
||
static cl::opt<bool> ForceMemsetPatternIntrinsic( | ||
"loop-idiom-force-memset-pattern-intrinsic", | ||
cl::desc("Enable use of the memset.pattern intrinsic"), cl::init(false), | ||
cl::Hidden); | ||
|
||
namespace { | ||
|
||
class LoopIdiomRecognize { | ||
|
@@ -303,10 +308,15 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) { | |
L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs; | ||
|
||
HasMemset = TLI->has(LibFunc_memset); | ||
// TODO: Unconditionally enable use of the memset pattern intrinsic (or at | ||
// least, opt-in via target hook) once we are confident it will never result | ||
// in worse codegen than without. For now, use it only when we would have | ||
// previously emitted a libcall to memset_pattern16 (or unless this is | ||
// overridden by command line option). | ||
HasMemsetPattern = TLI->has(LibFunc_memset_pattern16); | ||
HasMemcpy = TLI->has(LibFunc_memcpy); | ||
|
||
if (HasMemset || HasMemsetPattern || HasMemcpy) | ||
if (HasMemset || HasMemsetPattern || ForceMemsetPatternIntrinsic || HasMemcpy) | ||
if (SE->hasLoopInvariantBackedgeTakenCount(L)) | ||
return runOnCountableLoop(); | ||
|
||
|
@@ -392,14 +402,12 @@ static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) { | |
if (Size > 16) | ||
return nullptr; | ||
|
||
// If the constant is exactly 16 bytes, just use it. | ||
if (Size == 16) | ||
return C; | ||
// For now, don't handle types that aren't int, floats, or pointers. | ||
if (!isa<ConstantInt>(C) && !isa<ConstantFP>(C) && | ||
!isa<PointerType>(C->getType())) | ||
return nullptr; | ||
|
||
// Otherwise, we'll use an array of the constants. | ||
unsigned ArraySize = 16 / Size; | ||
ArrayType *AT = ArrayType::get(V->getType(), ArraySize); | ||
return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C)); | ||
return C; | ||
} | ||
|
||
LoopIdiomRecognize::LegalStoreKind | ||
|
@@ -463,8 +471,9 @@ LoopIdiomRecognize::isLegalStore(StoreInst *SI) { | |
// It looks like we can use SplatValue. | ||
return LegalStoreKind::Memset; | ||
} | ||
if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset && | ||
// Don't create memset_pattern16s with address spaces. | ||
if (!UnorderedAtomic && (HasMemsetPattern || ForceMemsetPatternIntrinsic) && | ||
!DisableLIRP::Memset && | ||
// Don't create memset.pattern intrinsic calls with address spaces. | ||
StorePtr->getType()->getPointerAddressSpace() == 0 && | ||
getMemSetPatternValue(StoredVal, DL)) { | ||
// It looks like we can use PatternValue! | ||
|
@@ -1064,53 +1073,93 @@ bool LoopIdiomRecognize::processLoopStridedStore( | |
return Changed; | ||
|
||
// Okay, everything looks good, insert the memset. | ||
// MemsetArg is the number of bytes for the memset libcall, and the number | ||
// of pattern repetitions if the memset.pattern intrinsic is being used. | ||
Value *MemsetArg; | ||
std::optional<int64_t> BytesWritten; | ||
|
||
if (PatternValue && (HasMemsetPattern || ForceMemsetPatternIntrinsic)) { | ||
const SCEV *TripCountS = | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There's a bunch of complexity in this section of the change that I'm not getting. Since we need the NumBytes (in the original code) for the AAInfo, why not just leave that alone? Isn't count (for memset.pattern intrinsic) just the trip count of the loop by definition? |
||
SE->getTripCountFromExitCount(BECount, IntIdxTy, CurLoop); | ||
if (!Expander.isSafeToExpand(TripCountS)) | ||
return Changed; | ||
const SCEVConstant *ConstStoreSize = dyn_cast<SCEVConstant>(StoreSizeSCEV); | ||
if (!ConstStoreSize) | ||
return Changed; | ||
Value *TripCount = Expander.expandCodeFor(TripCountS, IntIdxTy, | ||
Preheader->getTerminator()); | ||
uint64_t PatternRepsPerTrip = | ||
(ConstStoreSize->getValue()->getZExtValue() * 8) / | ||
DL->getTypeSizeInBits(PatternValue->getType()); | ||
// If ConstStoreSize is not equal to the width of PatternValue, then | ||
// MemsetArg is TripCount * (ConstStoreSize/PatternValueWidth). Else | ||
// MemSetArg is just TripCount. | ||
MemsetArg = | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Isn't this just weird way of compute NumBytes? |
||
PatternRepsPerTrip == 1 | ||
? TripCount | ||
: Builder.CreateMul(TripCount, | ||
Builder.getIntN(IntIdxTy->getIntegerBitWidth(), | ||
PatternRepsPerTrip)); | ||
if (auto *CI = dyn_cast<ConstantInt>(TripCount)) | ||
BytesWritten = | ||
CI->getZExtValue() * ConstStoreSize->getValue()->getZExtValue(); | ||
} else { | ||
const SCEV *NumBytesS = | ||
getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE); | ||
|
||
const SCEV *NumBytesS = | ||
getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE); | ||
|
||
// TODO: ideally we should still be able to generate memset if SCEV expander | ||
// is taught to generate the dependencies at the latest point. | ||
if (!Expander.isSafeToExpand(NumBytesS)) | ||
return Changed; | ||
|
||
Value *NumBytes = | ||
Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator()); | ||
// TODO: ideally we should still be able to generate memset if SCEV expander | ||
// is taught to generate the dependencies at the latest point. | ||
if (!Expander.isSafeToExpand(NumBytesS)) | ||
return Changed; | ||
MemsetArg = | ||
Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator()); | ||
if (auto *CI = dyn_cast<ConstantInt>(MemsetArg)) | ||
BytesWritten = CI->getZExtValue(); | ||
} | ||
assert(MemsetArg && "MemsetArg should have been set"); | ||
|
||
if (!SplatValue && !isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)) | ||
if (!SplatValue && !(ForceMemsetPatternIntrinsic || | ||
isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16))) | ||
return Changed; | ||
|
||
AAMDNodes AATags = TheStore->getAAMetadata(); | ||
for (Instruction *Store : Stores) | ||
AATags = AATags.merge(Store->getAAMetadata()); | ||
if (auto CI = dyn_cast<ConstantInt>(NumBytes)) | ||
AATags = AATags.extendTo(CI->getZExtValue()); | ||
if (BytesWritten) | ||
AATags = AATags.extendTo(BytesWritten.value()); | ||
else | ||
AATags = AATags.extendTo(-1); | ||
|
||
CallInst *NewCall; | ||
if (SplatValue) { | ||
NewCall = Builder.CreateMemSet( | ||
BasePtr, SplatValue, NumBytes, MaybeAlign(StoreAlignment), | ||
BasePtr, SplatValue, MemsetArg, MaybeAlign(StoreAlignment), | ||
/*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias); | ||
} else { | ||
assert (isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)); | ||
// Everything is emitted in default address space | ||
Type *Int8PtrTy = DestInt8PtrTy; | ||
|
||
StringRef FuncName = "memset_pattern16"; | ||
FunctionCallee MSP = getOrInsertLibFunc(M, *TLI, LibFunc_memset_pattern16, | ||
Builder.getVoidTy(), Int8PtrTy, Int8PtrTy, IntIdxTy); | ||
inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI); | ||
|
||
// Otherwise we should form a memset_pattern16. PatternValue is known to be | ||
// an constant array of 16-bytes. Plop the value into a mergable global. | ||
GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true, | ||
GlobalValue::PrivateLinkage, | ||
PatternValue, ".memset_pattern"); | ||
GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these. | ||
GV->setAlignment(Align(16)); | ||
Value *PatternPtr = GV; | ||
NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes}); | ||
assert(ForceMemsetPatternIntrinsic || | ||
isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)); | ||
assert(isa<SCEVConstant>(StoreSizeSCEV) && "Expected constant store size"); | ||
|
||
Value *PatternArg; | ||
IntegerType *PatternArgTy = | ||
Builder.getIntNTy(DL->getTypeSizeInBits(PatternValue->getType())); | ||
|
||
// If the pattern value can be casted directly to an integer argument, use | ||
// that. Otherwise (e.g. if the value is a global pointer), create a | ||
// GlobalVariable and load from it. | ||
if (isa<ConstantInt>(PatternValue)) | ||
PatternArg = PatternValue; | ||
else if (isa<ConstantFP>(PatternValue)) | ||
PatternArg = Builder.CreateBitCast(PatternValue, PatternArgTy); | ||
else if (isa<PointerType>(PatternValue->getType())) | ||
PatternArg = Builder.CreatePtrToInt(PatternValue, PatternArgTy); | ||
else | ||
report_fatal_error("Unexpected PatternValue type"); | ||
|
||
NewCall = Builder.CreateIntrinsic(Intrinsic::experimental_memset_pattern, | ||
{DestInt8PtrTy, PatternArgTy, IntIdxTy}, | ||
{BasePtr, PatternArg, MemsetArg, | ||
ConstantInt::getFalse(M->getContext())}); | ||
|
||
// Set the TBAA info if present. | ||
if (AATags.TBAA) | ||
|
@@ -1400,7 +1449,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad( | |
AAMDNodes AATags = TheLoad->getAAMetadata(); | ||
AAMDNodes StoreAATags = TheStore->getAAMetadata(); | ||
AATags = AATags.merge(StoreAATags); | ||
if (auto CI = dyn_cast<ConstantInt>(NumBytes)) | ||
if (auto *CI = dyn_cast<ConstantInt>(NumBytes)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. precommit this please |
||
AATags = AATags.extendTo(CI->getZExtValue()); | ||
else | ||
AATags = AATags.extendTo(-1); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals | ||
; RUN: opt -passes=loop-idiom -mtriple=riscv64 < %s -S | FileCheck %s | ||
; RUN: opt -passes=loop-idiom -mtriple=riscv64 -loop-idiom-force-memset-pattern-intrinsic < %s -S \ | ||
; RUN: | FileCheck -check-prefix=CHECK-INTRIN %s | ||
|
||
define dso_local void @double_memset(ptr nocapture %p) { | ||
; CHECK-LABEL: @double_memset( | ||
; CHECK-NEXT: entry: | ||
; CHECK-NEXT: br label [[FOR_BODY:%.*]] | ||
; CHECK: for.cond.cleanup: | ||
; CHECK-NEXT: ret void | ||
; CHECK: for.body: | ||
; CHECK-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] | ||
; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds double, ptr [[P:%.*]], i64 [[I_07]] | ||
; CHECK-NEXT: store double 3.141590e+00, ptr [[PTR1]], align 1 | ||
; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_07]], 1 | ||
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 16 | ||
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] | ||
; | ||
; CHECK-INTRIN-LABEL: @double_memset( | ||
; CHECK-INTRIN-NEXT: entry: | ||
; CHECK-INTRIN-NEXT: call void @llvm.experimental.memset.pattern.p0.i64.i64(ptr [[P:%.*]], i64 4614256650576692846, i64 16, i1 false) | ||
; CHECK-INTRIN-NEXT: br label [[FOR_BODY:%.*]] | ||
; CHECK-INTRIN: for.cond.cleanup: | ||
; CHECK-INTRIN-NEXT: ret void | ||
; CHECK-INTRIN: for.body: | ||
; CHECK-INTRIN-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] | ||
; CHECK-INTRIN-NEXT: [[PTR1:%.*]] = getelementptr inbounds double, ptr [[P]], i64 [[I_07]] | ||
; CHECK-INTRIN-NEXT: [[INC]] = add nuw nsw i64 [[I_07]], 1 | ||
; CHECK-INTRIN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 16 | ||
; CHECK-INTRIN-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] | ||
; | ||
entry: | ||
br label %for.body | ||
|
||
for.cond.cleanup: | ||
ret void | ||
|
||
for.body: | ||
%i.07 = phi i64 [ %inc, %for.body ], [ 0, %entry ] | ||
%ptr1 = getelementptr inbounds double, ptr %p, i64 %i.07 | ||
store double 3.14159e+00, ptr %ptr1, align 1 | ||
%inc = add nuw nsw i64 %i.07, 1 | ||
%exitcond.not = icmp eq i64 %inc, 16 | ||
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body | ||
} | ||
;. | ||
; CHECK-INTRIN: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } | ||
;. |
Uh oh!
There was an error while loading. Please reload this page.