Skip to content

Commit c6931c2

Browse files
authored
[FuncSpec] Only compute Latency bonus when necessary (#113159)
Only compute the Latency component of a specialisation's Bonus when necessary, to avoid unnecessarily computing the Block Frequency Information for a Function.
1 parent 5b9c76b commit c6931c2

File tree

4 files changed

+238
-162
lines changed

4 files changed

+238
-162
lines changed

llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h

Lines changed: 13 additions & 42 deletions
Original file line number | Diff line number | Diff line change
@@ -140,41 +140,10 @@ struct Spec {
140140
: F(F), Sig(S), Score(Score) {}
141141
};
142142

143-
struct Bonus {
144-
unsigned CodeSize = 0;
145-
unsigned Latency = 0;
146-
147-
Bonus() = default;
148-
149-
Bonus(Cost CodeSize, Cost Latency) {
150-
int64_t Sz = *CodeSize.getValue();
151-
int64_t Ltc = *Latency.getValue();
152-
153-
assert(Sz >= 0 && Ltc >= 0 && "CodeSize and Latency cannot be negative");
154-
// It is safe to down cast since we know the arguments
155-
// cannot be negative and Cost is of type int64_t.
156-
this->CodeSize = static_cast<unsigned>(Sz);
157-
this->Latency = static_cast<unsigned>(Ltc);
158-
}
159-
160-
Bonus &operator+=(const Bonus RHS) {
161-
CodeSize += RHS.CodeSize;
162-
Latency += RHS.Latency;
163-
return *this;
164-
}
165-
166-
Bonus operator+(const Bonus RHS) const {
167-
return Bonus(CodeSize + RHS.CodeSize, Latency + RHS.Latency);
168-
}
169-
170-
bool operator==(const Bonus RHS) const {
171-
return CodeSize == RHS.CodeSize && Latency == RHS.Latency;
172-
}
173-
};
174-
175143
class InstCostVisitor : public InstVisitor<InstCostVisitor, Constant *> {
144+
std::function<BlockFrequencyInfo &(Function &)> GetBFI;
145+
Function *F;
176146
const DataLayout &DL;
177-
BlockFrequencyInfo &BFI;
178147
TargetTransformInfo &TTI;
179148
SCCPSolver &Solver;
180149

@@ -192,26 +161,29 @@ class InstCostVisitor : public InstVisitor<InstCostVisitor, Constant *> {
192161
ConstMap::iterator LastVisited;
193162

194163
public:
195-
InstCostVisitor(const DataLayout &DL, BlockFrequencyInfo &BFI,
196-
TargetTransformInfo &TTI, SCCPSolver &Solver)
197-
: DL(DL), BFI(BFI), TTI(TTI), Solver(Solver) {}
164+
InstCostVisitor(std::function<BlockFrequencyInfo &(Function &)> GetBFI,
165+
Function *F, const DataLayout &DL, TargetTransformInfo &TTI,
166+
SCCPSolver &Solver)
167+
: GetBFI(GetBFI), F(F), DL(DL), TTI(TTI), Solver(Solver) {}
198168

199169
bool isBlockExecutable(BasicBlock *BB) {
200170
return Solver.isBlockExecutable(BB) && !DeadBlocks.contains(BB);
201171
}
202172

203-
Bonus getSpecializationBonus(Argument *A, Constant *C);
173+
Cost getCodeSizeSavingsForArg(Argument *A, Constant *C);
174+
175+
Cost getCodeSizeSavingsFromPendingPHIs();
204176

205-
Bonus getBonusFromPendingPHIs();
177+
Cost getLatencySavingsForKnownConstants();
206178

207179
private:
208180
friend class InstVisitor<InstCostVisitor, Constant *>;
209181

210182
static bool canEliminateSuccessor(BasicBlock *BB, BasicBlock *Succ,
211183
DenseSet<BasicBlock *> &DeadBlocks);
212184

213-
Bonus getUserBonus(Instruction *User, Value *Use = nullptr,
214-
Constant *C = nullptr);
185+
Cost getCodeSizeSavingsForUser(Instruction *User, Value *Use = nullptr,
186+
Constant *C = nullptr);
215187

216188
Cost estimateBasicBlocks(SmallVectorImpl<BasicBlock *> &WorkList);
217189
Cost estimateSwitchInst(SwitchInst &I);
@@ -283,9 +255,8 @@ class FunctionSpecializer {
283255
bool run();
284256

285257
InstCostVisitor getInstCostVisitorFor(Function *F) {
286-
auto &BFI = GetBFI(*F);
287258
auto &TTI = GetTTI(*F);
288-
return InstCostVisitor(M.getDataLayout(), BFI, TTI, Solver);
259+
return InstCostVisitor(GetBFI, F, M.getDataLayout(), TTI, Solver);
289260
}
290261

291262
private:

llvm/lib/Transforms/IPO/FunctionSpecialization.cpp

Lines changed: 102 additions & 43 deletions
Original file line number | Diff line number | Diff line change
@@ -112,7 +112,7 @@ bool InstCostVisitor::canEliminateSuccessor(BasicBlock *BB, BasicBlock *Succ,
112112
Cost InstCostVisitor::estimateBasicBlocks(
113113
SmallVectorImpl<BasicBlock *> &WorkList) {
114114
Cost CodeSize = 0;
115-
// Accumulate the instruction cost of each basic block weighted by frequency.
115+
// Accumulate the codesize savings of each basic block.
116116
while (!WorkList.empty()) {
117117
BasicBlock *BB = WorkList.pop_back_val();
118118

@@ -154,37 +154,73 @@ static Constant *findConstantFor(Value *V, ConstMap &KnownConstants) {
154154
return KnownConstants.lookup(V);
155155
}
156156

157-
Bonus InstCostVisitor::getBonusFromPendingPHIs() {
158-
Bonus B;
157+
Cost InstCostVisitor::getCodeSizeSavingsFromPendingPHIs() {
158+
Cost CodeSize;
159159
while (!PendingPHIs.empty()) {
160160
Instruction *Phi = PendingPHIs.pop_back_val();
161161
// The pending PHIs could have been proven dead by now.
162162
if (isBlockExecutable(Phi->getParent()))
163-
B += getUserBonus(Phi);
163+
CodeSize += getCodeSizeSavingsForUser(Phi);
164164
}
165-
return B;
165+
return CodeSize;
166166
}
167167

168-
/// Compute a bonus for replacing argument \p A with constant \p C.
169-
Bonus InstCostVisitor::getSpecializationBonus(Argument *A, Constant *C) {
168+
/// Compute the codesize savings for replacing argument \p A with constant \p C.
169+
Cost InstCostVisitor::getCodeSizeSavingsForArg(Argument *A, Constant *C) {
170170
LLVM_DEBUG(dbgs() << "FnSpecialization: Analysing bonus for constant: "
171171
<< C->getNameOrAsOperand() << "\n");
172-
Bonus B;
172+
Cost CodeSize;
173173
for (auto *U : A->users())
174174
if (auto *UI = dyn_cast<Instruction>(U))
175175
if (isBlockExecutable(UI->getParent()))
176-
B += getUserBonus(UI, A, C);
176+
CodeSize += getCodeSizeSavingsForUser(UI, A, C);
177177

178178
LLVM_DEBUG(dbgs() << "FnSpecialization: Accumulated bonus {CodeSize = "
179-
<< B.CodeSize << ", Latency = " << B.Latency
180-
<< "} for argument " << *A << "\n");
181-
return B;
179+
<< CodeSize << "} for argument " << *A << "\n");
180+
return CodeSize;
182181
}
183182

184-
Bonus InstCostVisitor::getUserBonus(Instruction *User, Value *Use, Constant *C) {
183+
/// Compute the latency savings from replacing all arguments with constants for
184+
/// a specialization candidate. As this function computes the latency savings
185+
/// for all Instructions in KnownConstants at once, it should be called only
186+
/// after every instruction has been visited, i.e. after:
187+
///
188+
/// * getCodeSizeSavingsForArg has been run for every constant argument of a
189+
/// specialization candidate
190+
///
191+
/// * getCodeSizeSavingsFromPendingPHIs has been run
192+
///
193+
/// to ensure that the latency savings are calculated for all Instructions we
194+
/// have visited and found to be constant.
///
/// \returns the sum, over every Instruction key in KnownConstants, of that
/// instruction's TCK_Latency cost multiplied by its block-frequency weight.
195+
Cost InstCostVisitor::getLatencySavingsForKnownConstants() {
196+
// The BFI for F is requested only here, through the stored GetBFI callback,
// so callers that never ask for latency savings never trigger this lookup.
auto &BFI = GetBFI(*F);
197+
Cost TotalLatency = 0;
198+
199+
for (auto Pair : KnownConstants) {
200+
// Only Instruction keys contribute; any other kind of key is skipped.
// NOTE(review): presumably these are the Argument values inserted as `Use`
// by getCodeSizeSavingsForUser -- confirm.
Instruction *I = dyn_cast<Instruction>(Pair.first);
201+
if (!I)
202+
continue;
203+
204+
// Frequency of the instruction's block relative to the function entry.
// NOTE(review): the quotient is stored in a uint64_t, so this looks like an
// integer division in which blocks colder than the entry get weight 0 --
// confirm that getFrequency() returns an integer type.
uint64_t Weight = BFI.getBlockFreq(I->getParent()).getFrequency() /
205+
BFI.getEntryFreq().getFrequency();
206+
207+
Cost Latency =
208+
Weight * TTI.getInstructionCost(I, TargetTransformInfo::TCK_Latency);
209+
210+
LLVM_DEBUG(dbgs() << "FnSpecialization: {Latency = " << Latency
211+
<< "} for instruction " << *I << "\n");
212+
213+
TotalLatency += Latency;
214+
}
215+
216+
return TotalLatency;
217+
}
218+
219+
Cost InstCostVisitor::getCodeSizeSavingsForUser(Instruction *User, Value *Use,
220+
Constant *C) {
185221
// We have already propagated a constant for this user.
186222
if (KnownConstants.contains(User))
187-
return {0, 0};
223+
return 0;
188224

189225
// Cache the iterator before visiting.
190226
LastVisited = Use ? KnownConstants.insert({Use, C}).first
@@ -198,7 +234,7 @@ Bonus InstCostVisitor::getUserBonus(Instruction *User, Value *Use, Constant *C)
198234
} else {
199235
C = visit(*User);
200236
if (!C)
201-
return {0, 0};
237+
return 0;
202238
}
203239

204240
// Even though it doesn't make sense to bind switch and branch instructions
@@ -208,23 +244,15 @@ Bonus InstCostVisitor::getUserBonus(Instruction *User, Value *Use, Constant *C)
208244

209245
CodeSize += TTI.getInstructionCost(User, TargetTransformInfo::TCK_CodeSize);
210246

211-
uint64_t Weight = BFI.getBlockFreq(User->getParent()).getFrequency() /
212-
BFI.getEntryFreq().getFrequency();
213-
214-
Cost Latency = Weight *
215-
TTI.getInstructionCost(User, TargetTransformInfo::TCK_Latency);
216-
217247
LLVM_DEBUG(dbgs() << "FnSpecialization: {CodeSize = " << CodeSize
218-
<< ", Latency = " << Latency << "} for user "
219-
<< *User << "\n");
248+
<< "} for user " << *User << "\n");
220249

221-
Bonus B(CodeSize, Latency);
222250
for (auto *U : User->users())
223251
if (auto *UI = dyn_cast<Instruction>(U))
224252
if (UI != User && isBlockExecutable(UI->getParent()))
225-
B += getUserBonus(UI, User, C);
253+
CodeSize += getCodeSizeSavingsForUser(UI, User, C);
226254

227-
return B;
255+
return CodeSize;
228256
}
229257

230258
Cost InstCostVisitor::estimateSwitchInst(SwitchInst &I) {
@@ -809,6 +837,18 @@ static Function *cloneCandidateFunction(Function *F, unsigned NSpecs) {
809837
return Clone;
810838
}
811839

840+
/// Get the unsigned Value of given Cost object. Assumes the Cost is always
841+
/// non-negative, which is true for both TCK_CodeSize and TCK_Latency, and
842+
/// always Valid.
///
/// \param C the Cost to convert; its value is dereferenced without a validity
/// check, so the caller must guarantee the precondition above.
/// \returns the value of \p C converted to unsigned.
843+
static unsigned getCostValue(const Cost &C) {
844+
int64_t Value = *C.getValue();
845+
846+
assert(Value >= 0 && "CodeSize and Latency cannot be negative");
847+
// The assert above rules out negative values, so the conversion cannot wrap
848+
// a negative number into a large unsigned one.
// NOTE(review): a value wider than `unsigned` would still be truncated by
// this cast; presumably accumulated costs never get that large -- confirm.
849+
return static_cast<unsigned>(Value);
850+
}
851+
812852
bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize,
813853
SmallVectorImpl<Spec> &AllSpecs,
814854
SpecMap &SM) {
@@ -875,48 +915,67 @@ bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize,
875915
AllSpecs[Index].CallSites.push_back(&CS);
876916
} else {
877917
// Calculate the specialisation gain.
878-
Bonus B;
918+
Cost CodeSize;
879919
unsigned Score = 0;
880920
InstCostVisitor Visitor = getInstCostVisitorFor(F);
881921
for (ArgInfo &A : S.Args) {
882-
B += Visitor.getSpecializationBonus(A.Formal, A.Actual);
922+
CodeSize += Visitor.getCodeSizeSavingsForArg(A.Formal, A.Actual);
883923
Score += getInliningBonus(A.Formal, A.Actual);
884924
}
885-
B += Visitor.getBonusFromPendingPHIs();
925+
CodeSize += Visitor.getCodeSizeSavingsFromPendingPHIs();
886926

887-
888-
LLVM_DEBUG(dbgs() << "FnSpecialization: Specialization bonus {CodeSize = "
889-
<< B.CodeSize << ", Latency = " << B.Latency
890-
<< ", Inlining = " << Score << "}\n");
891-
892-
FunctionGrowth[F] += FuncSize - B.CodeSize;
893-
894-
auto IsProfitable = [](Bonus &B, unsigned Score, unsigned FuncSize,
895-
unsigned FuncGrowth) -> bool {
927+
auto IsProfitable = [&]() -> bool {
896928
// No check required.
897929
if (ForceSpecialization)
898930
return true;
931+
932+
unsigned CodeSizeSavings = getCostValue(CodeSize);
933+
// TODO: We should only accumulate codesize increase of specializations
934+
// that are actually created.
935+
FunctionGrowth[F] += FuncSize - CodeSizeSavings;
936+
937+
LLVM_DEBUG(
938+
dbgs() << "FnSpecialization: Specialization bonus {Inlining = "
939+
<< Score << " (" << (Score * 100 / FuncSize) << "%)}\n");
940+
899941
// Minimum inlining bonus.
900942
if (Score > MinInliningBonus * FuncSize / 100)
901943
return true;
944+
945+
LLVM_DEBUG(
946+
dbgs() << "FnSpecialization: Specialization bonus {CodeSize = "
947+
<< CodeSizeSavings << " ("
948+
<< (CodeSizeSavings * 100 / FuncSize) << "%)}\n");
949+
902950
// Minimum codesize savings.
903-
if (B.CodeSize < MinCodeSizeSavings * FuncSize / 100)
951+
if (CodeSizeSavings < MinCodeSizeSavings * FuncSize / 100)
904952
return false;
953+
954+
// Lazily compute the Latency, to avoid unnecessarily computing BFI.
955+
unsigned LatencySavings =
956+
getCostValue(Visitor.getLatencySavingsForKnownConstants());
957+
958+
LLVM_DEBUG(
959+
dbgs() << "FnSpecialization: Specialization bonus {Latency = "
960+
<< LatencySavings << " ("
961+
<< (LatencySavings * 100 / FuncSize) << "%)}\n");
962+
905963
// Minimum latency savings.
906-
if (B.Latency < MinLatencySavings * FuncSize / 100)
964+
if (LatencySavings < MinLatencySavings * FuncSize / 100)
907965
return false;
908966
// Maximum codesize growth.
909-
if (FuncGrowth / FuncSize > MaxCodeSizeGrowth)
967+
if (FunctionGrowth[F] / FuncSize > MaxCodeSizeGrowth)
910968
return false;
969+
970+
Score += std::max(CodeSizeSavings, LatencySavings);
911971
return true;
912972
};
913973

914974
// Discard unprofitable specialisations.
915-
if (!IsProfitable(B, Score, FuncSize, FunctionGrowth[F]))
975+
if (!IsProfitable())
916976
continue;
917977

918978
// Create a new specialisation entry.
919-
Score += std::max(B.CodeSize, B.Latency);
920979
auto &Spec = AllSpecs.emplace_back(F, S, Score);
921980
if (CS.getFunction() != F)
922981
Spec.CallSites.push_back(&CS);

llvm/test/Transforms/SCCP/ipsccp-preserve-pdt.ll

Lines changed: 9 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -4,25 +4,25 @@
44

55
; This test case is trying to validate that the postdomtree is preserved
66
; correctly by the ipsccp pass. A tricky bug was introduced in commit
7-
; 1b1232047e83b69561 when PDT would be feched using getCachedAnalysis in order
7+
; 1b1232047e83b69561 when PDT would be fetched using getCachedAnalysis in order
88
; to setup a DomTreeUpdater (to update the PDT during transformation in order
99
; to preserve the analysis). But given that commit the PDT could end up being
1010
; required and calculated via BlockFrequency analysis. So the problem was that
1111
; when setting up the DomTreeUpdater we used a nullptr in case PDT wasn't
12-
; cached at the begininng of IPSCCP, to indicate that no updates where needed
12+
; cached at the beginning of IPSCCP, to indicate that no updates were needed
1313
; for PDT. But then the PDT was calculated, given the input IR, and preserved
1414
; using the non-updated state (as the DTU wasn't configured for updating the
1515
; PDT).
1616

1717
; CHECK-NOT: <badref>
1818
; CHECK: Inorder PostDominator Tree: DFSNumbers invalid: 0 slow queries.
19-
; CHECK-NEXT: [1] <<exit node>> {4294967295,4294967295} [0]
20-
; CHECK-NEXT: [2] %for.cond34 {4294967295,4294967295} [1]
21-
; CHECK-NEXT: [3] %for.cond16 {4294967295,4294967295} [2]
22-
; CHECK-NEXT: [2] %for.body {4294967295,4294967295} [1]
23-
; CHECK-NEXT: [2] %if.end4 {4294967295,4294967295} [1]
24-
; CHECK-NEXT: [3] %entry {4294967295,4294967295} [2]
25-
; CHECK-NEXT: Roots: %for.cond34 %for.body
19+
; CHECK-NEXT: [1] <<exit node>> {4294967295,4294967295} [0]
20+
; CHECK-NEXT: [2] %for.body {4294967295,4294967295} [1]
21+
; CHECK-NEXT: [2] %if.end4 {4294967295,4294967295} [1]
22+
; CHECK-NEXT: [3] %entry {4294967295,4294967295} [2]
23+
; CHECK-NEXT: [2] %for.cond34 {4294967295,4294967295} [1]
24+
; CHECK-NEXT: [3] %for.cond16 {4294967295,4294967295} [2]
25+
; CHECK-NEXT: Roots: %for.body %for.cond34
2626
; CHECK-NEXT: PostDominatorTree for function: bar
2727
; CHECK-NOT: <badref>
2828

0 commit comments

Comments
 (0)