Skip to content

Commit 1071279

Browse files
WenleiHetstellar
authored and committed
[CSSPGO] Use merged base profile for hot threshold calculation
A context-sensitive profile effectively splits a function profile into many copies, each representing the CFG profile of a particular calling context. That makes the count distribution look flatter, as we now have more function profiles each with lower counts, which in turn leads to lower hot thresholds. We now tell the threshold computation to merge context profiles first, before calculating percentile-based cutoffs, to compensate for the seemingly flat context profile. This can be controlled by the switch `profile-summary-contextless`. Earlier measurement showed a ~0.4% perf boost with this tuning on spec2k6 for CSSPGO (with pseudo-probe and the new inliner). Differential Revision: https://reviews.llvm.org/D95980
1 parent db88d92 commit 1071279

File tree

6 files changed

+192
-11
lines changed

6 files changed

+192
-11
lines changed

llvm/include/llvm/ProfileData/ProfileCommon.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include "llvm/ADT/ArrayRef.h"
1818
#include "llvm/IR/ProfileSummary.h"
1919
#include "llvm/ProfileData/InstrProf.h"
20+
#include "llvm/ProfileData/SampleProf.h"
2021
#include "llvm/Support/Error.h"
2122
#include <algorithm>
2223
#include <cstdint>
@@ -89,6 +90,8 @@ class SampleProfileSummaryBuilder final : public ProfileSummaryBuilder {
8990

9091
void addRecord(const sampleprof::FunctionSamples &FS,
9192
bool isCallsiteSample = false);
93+
std::unique_ptr<ProfileSummary> computeSummaryForProfiles(
94+
const StringMap<sampleprof::FunctionSamples> &Profiles);
9295
std::unique_ptr<ProfileSummary> getSummary();
9396
};
9497

llvm/lib/ProfileData/ProfileSummaryBuilder.cpp

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,14 @@
1818
#include "llvm/ProfileData/ProfileCommon.h"
1919
#include "llvm/ProfileData/SampleProf.h"
2020
#include "llvm/Support/Casting.h"
21+
#include "llvm/Support/CommandLine.h"
2122

2223
using namespace llvm;
2324

25+
// Hidden command-line switch that controls whether context profiles are merged
// before the profile summary thresholds are calculated. It is declared with an
// initial value of false, but computeSummaryForProfiles also merges when the
// switch was not given on the command line and
// sampleprof::FunctionSamples::ProfileIsCS is set.
cl::opt<bool> UseContextLessSummary(
26+
"profile-summary-contextless", cl::Hidden, cl::init(false), cl::ZeroOrMore,
27+
cl::desc("Merge context profiles before calculating thresholds."));
28+
2429
// A set of cutoff values. Each value, when divided by ProfileSummary::Scale
2530
// (which is 1000000) is a desired percentile of total counts.
2631
static const uint32_t DefaultCutoffsData[] = {
@@ -111,6 +116,35 @@ std::unique_ptr<ProfileSummary> SampleProfileSummaryBuilder::getSummary() {
111116
MaxFunctionCount, NumCounts, NumFunctions);
112117
}
113118

119+
/// Adds every function profile in \p Profiles to this builder via addRecord()
/// and returns the result of getSummary().
///
/// The builder must not have recorded any function yet; this is asserted via
/// NumFunctions == 0.
///
/// If UseContextLessSummary is set, or if
/// sampleprof::FunctionSamples::ProfileIsCS is set and the switch was not
/// given on the command line, the profiles are first merged into a temporary
/// map keyed by FunctionSamples::getName(), and the summary is computed over
/// that merged map instead of \p Profiles.
std::unique_ptr<ProfileSummary>
120+
SampleProfileSummaryBuilder::computeSummaryForProfiles(
121+
const StringMap<sampleprof::FunctionSamples> &Profiles) {
122+
assert(NumFunctions == 0 &&
123+
"This can only be called on an empty summary builder");
124+
StringMap<sampleprof::FunctionSamples> ContextLessProfiles;
125+
const StringMap<sampleprof::FunctionSamples> *ProfilesToUse = &Profiles;
126+
// For CSSPGO, a context-sensitive profile effectively splits a function
127+
// profile into many copies, each representing the CFG profile of a particular
128+
// calling context. That makes the count distribution look flatter, as we now
129+
// have more function profiles each with lower counts, which in turn leads to
130+
// lower hot thresholds. To compensate for that, by default we merge context
131+
// profiles before computing the profile summary.
132+
if (UseContextLessSummary || (sampleprof::FunctionSamples::ProfileIsCS &&
133+
!UseContextLessSummary.getNumOccurrences())) { // switch not given explicitly
134+
for (const auto &I : Profiles) {
135+
ContextLessProfiles[I.second.getName()].merge(I.second);
136+
}
137+
ProfilesToUse = &ContextLessProfiles;
138+
}
139+
140+
for (const auto &I : *ProfilesToUse) {
141+
const sampleprof::FunctionSamples &Profile = I.second;
142+
addRecord(Profile);
143+
}
144+
145+
return getSummary();
146+
}
147+
114148
std::unique_ptr<ProfileSummary> InstrProfSummaryBuilder::getSummary() {
115149
computeDetailedSummary();
116150
return std::make_unique<ProfileSummary>(

llvm/lib/ProfileData/SampleProfReader.cpp

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1610,9 +1610,5 @@ SampleProfileReader::create(std::unique_ptr<MemoryBuffer> &B, LLVMContext &C,
16101610
// profile. Binary format has the profile summary in its header.
16111611
void SampleProfileReader::computeSummary() {
16121612
SampleProfileSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs);
1613-
for (const auto &I : Profiles) {
1614-
const FunctionSamples &Profile = I.second;
1615-
Builder.addRecord(Profile);
1616-
}
1617-
Summary = Builder.getSummary();
1613+
Summary = Builder.computeSummaryForProfiles(Profiles);
16181614
}

llvm/lib/ProfileData/SampleProfWriter.cpp

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -752,9 +752,5 @@ SampleProfileWriter::create(std::unique_ptr<raw_ostream> &OS,
752752
void SampleProfileWriter::computeSummary(
753753
const StringMap<FunctionSamples> &ProfileMap) {
754754
SampleProfileSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs);
755-
for (const auto &I : ProfileMap) {
756-
const FunctionSamples &Profile = I.second;
757-
Builder.addRecord(Profile);
758-
}
759-
Summary = Builder.getSummary();
755+
Summary = Builder.computeSummaryForProfiles(ProfileMap);
760756
}

llvm/test/Transforms/SampleProfile/csspgo-inline.ll

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@
3030

3131
; INLINE-NEW-LIMIT1-NOT: remark
3232

33-
; INLINE-NEW-LIMIT2: remark: merged.cpp:27:11: _Z8funcLeafi inlined into _Z5funcAi to match profiling context with (cost={{[0-9]+}}, threshold={{[0-9]+}}) at callsite _Z5funcAi:1:11
3433
; INLINE-NEW-LIMIT2: remark: merged.cpp:33:11: _Z8funcLeafi inlined into _Z5funcBi to match profiling context with (cost={{[0-9]+}}, threshold={{[0-9]+}}) at callsite _Z5funcBi:1:11
3534
; INLINE-NEW-LIMIT2-NOT: remark
3635

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
; Test for CSSPGO's profile summary computation with and without pre-merging context profiles
2+
3+
; RUN: opt < %s -passes=sample-profile,print-profile-summary -sample-profile-file=%S/Inputs/profile-context-tracker.prof -profile-summary-cutoff-hot=999900 -profile-sample-accurate -profile-summary-contextless=0 -S -o /dev/null 2>&1 | FileCheck %s --check-prefix=SUMMARY-UNMERGED
4+
; RUN: opt < %s -passes=sample-profile,print-profile-summary -sample-profile-file=%S/Inputs/profile-context-tracker.prof -profile-summary-cutoff-hot=999900 -profile-sample-accurate -profile-summary-contextless=1 -S -o /dev/null 2>&1 | FileCheck %s --check-prefix=SUMMARY-MERGED
5+
6+
; SUMMARY-UNMERGED: main :hot entry
7+
; SUMMARY-MERGED-NOT: main :hot entry
8+
9+
10+
@factor = dso_local global i32 3, align 4, !dbg !0
11+
12+
define dso_local i32 @main() local_unnamed_addr #0 !dbg !18 {
13+
entry:
14+
br label %for.body, !dbg !25
15+
16+
for.cond.cleanup: ; preds = %for.body
17+
ret i32 %add3, !dbg !27
18+
19+
for.body: ; preds = %for.body, %entry
20+
%x.011 = phi i32 [ 300000, %entry ], [ %dec, %for.body ]
21+
%r.010 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
22+
%call = tail call i32 @_Z5funcBi(i32 %x.011), !dbg !32
23+
%add = add nuw nsw i32 %x.011, 1, !dbg !31
24+
%call1 = tail call i32 @_Z5funcAi(i32 %add), !dbg !28
25+
%add2 = add i32 %call, %r.010, !dbg !34
26+
%add3 = add i32 %add2, %call1, !dbg !35
27+
%dec = add nsw i32 %x.011, -1, !dbg !36
28+
%cmp = icmp eq i32 %x.011, 0, !dbg !38
29+
br i1 %cmp, label %for.cond.cleanup, label %for.body, !dbg !25
30+
}
31+
32+
define dso_local i32 @_Z5funcAi(i32 %x) local_unnamed_addr #1 !dbg !40 {
33+
entry:
34+
%add = add nsw i32 %x, 100000, !dbg !44
35+
%call = tail call i32 @_Z8funcLeafi(i32 %add), !dbg !45
36+
ret i32 %call, !dbg !46
37+
}
38+
39+
define dso_local i32 @_Z8funcLeafi(i32 %x) local_unnamed_addr #1 !dbg !54 {
40+
entry:
41+
%cmp = icmp sgt i32 %x, 0, !dbg !57
42+
br i1 %cmp, label %while.body, label %while.cond2.preheader, !dbg !59
43+
44+
while.cond2.preheader: ; preds = %entry
45+
%cmp313 = icmp slt i32 %x, 0, !dbg !60
46+
br i1 %cmp313, label %while.body4, label %if.end, !dbg !63
47+
48+
while.body: ; preds = %while.body, %entry
49+
%x.addr.016 = phi i32 [ %sub, %while.body ], [ %x, %entry ]
50+
%tmp = load volatile i32, i32* @factor, align 4, !dbg !64
51+
%call = tail call i32 @_Z3fibi(i32 %tmp), !dbg !67
52+
%sub = sub nsw i32 %x.addr.016, %call, !dbg !68
53+
%cmp1 = icmp sgt i32 %sub, 0, !dbg !69
54+
br i1 %cmp1, label %while.body, label %if.end, !dbg !71
55+
56+
while.body4: ; preds = %while.body4, %while.cond2.preheader
57+
%x.addr.114 = phi i32 [ %add, %while.body4 ], [ %x, %while.cond2.preheader ]
58+
%tmp1 = load volatile i32, i32* @factor, align 4, !dbg !72
59+
%call5 = tail call i32 @_Z3fibi(i32 %tmp1), !dbg !74
60+
%add = add nsw i32 %call5, %x.addr.114, !dbg !75
61+
%cmp3 = icmp slt i32 %add, 0, !dbg !60
62+
br i1 %cmp3, label %while.body4, label %if.end, !dbg !63
63+
64+
if.end: ; preds = %while.body4, %while.body, %while.cond2.preheader
65+
%x.addr.2 = phi i32 [ 0, %while.cond2.preheader ], [ %sub, %while.body ], [ %add, %while.body4 ]
66+
ret i32 %x.addr.2, !dbg !76
67+
}
68+
69+
define dso_local i32 @_Z5funcBi(i32 %x) local_unnamed_addr #0 !dbg !47 {
70+
entry:
71+
%sub = add nsw i32 %x, -100000, !dbg !51
72+
%call = tail call i32 @_Z8funcLeafi(i32 %sub), !dbg !52
73+
ret i32 %call, !dbg !53
74+
}
75+
76+
declare i32 @_Z3fibi(i32)
77+
78+
attributes #0 = { nofree noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
79+
attributes #1 = { nofree norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
80+
81+
!llvm.dbg.cu = !{!2}
82+
!llvm.module.flags = !{!14, !15, !16}
83+
!llvm.ident = !{!17}
84+
85+
!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
86+
!1 = distinct !DIGlobalVariable(name: "factor", scope: !2, file: !3, line: 21, type: !13, isLocal: false, isDefinition: true)
87+
!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "clang version 11.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !5, globals: !12, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None)
88+
!3 = !DIFile(filename: "merged.cpp", directory: "/local/autofdo")
89+
!4 = !{}
90+
!5 = !{!6, !10, !11}
91+
!6 = !DISubprogram(name: "funcA", linkageName: "_Z5funcAi", scope: !3, file: !3, line: 6, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
92+
!7 = !DISubroutineType(types: !8)
93+
!8 = !{!9, !9}
94+
!9 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
95+
!10 = !DISubprogram(name: "funcB", linkageName: "_Z5funcBi", scope: !3, file: !3, line: 7, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
96+
!11 = !DISubprogram(name: "funcLeaf", linkageName: "_Z8funcLeafi", scope: !3, file: !3, line: 22, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
97+
!12 = !{!0}
98+
!13 = !DIDerivedType(tag: DW_TAG_volatile_type, baseType: !9)
99+
!14 = !{i32 7, !"Dwarf Version", i32 4}
100+
!15 = !{i32 2, !"Debug Info Version", i32 3}
101+
!16 = !{i32 1, !"wchar_size", i32 4}
102+
!17 = !{!"clang version 11.0.0"}
103+
!18 = distinct !DISubprogram(name: "main", scope: !3, file: !3, line: 11, type: !19, scopeLine: 11, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !21)
104+
!19 = !DISubroutineType(types: !20)
105+
!20 = !{!9}
106+
!21 = !{!22, !23}
107+
!22 = !DILocalVariable(name: "r", scope: !18, file: !3, line: 12, type: !9)
108+
!23 = !DILocalVariable(name: "x", scope: !24, file: !3, line: 13, type: !9)
109+
!24 = distinct !DILexicalBlock(scope: !18, file: !3, line: 13, column: 3)
110+
!25 = !DILocation(line: 13, column: 3, scope: !26)
111+
!26 = !DILexicalBlockFile(scope: !24, file: !3, discriminator: 2)
112+
!27 = !DILocation(line: 17, column: 3, scope: !18)
113+
!28 = !DILocation(line: 14, column: 10, scope: !29)
114+
!29 = distinct !DILexicalBlock(scope: !30, file: !3, line: 13, column: 37)
115+
!30 = distinct !DILexicalBlock(scope: !24, file: !3, line: 13, column: 3)
116+
!31 = !DILocation(line: 14, column: 29, scope: !29)
117+
!32 = !DILocation(line: 14, column: 21, scope: !33)
118+
!33 = !DILexicalBlockFile(scope: !29, file: !3, discriminator: 2)
119+
!34 = !DILocation(line: 14, column: 19, scope: !29)
120+
!35 = !DILocation(line: 14, column: 7, scope: !29)
121+
!36 = !DILocation(line: 13, column: 33, scope: !37)
122+
!37 = !DILexicalBlockFile(scope: !30, file: !3, discriminator: 6)
123+
!38 = !DILocation(line: 13, column: 26, scope: !39)
124+
!39 = !DILexicalBlockFile(scope: !30, file: !3, discriminator: 2)
125+
!40 = distinct !DISubprogram(name: "funcA", linkageName: "_Z5funcAi", scope: !3, file: !3, line: 26, type: !7, scopeLine: 26, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
126+
!44 = !DILocation(line: 27, column: 22, scope: !40)
127+
!45 = !DILocation(line: 27, column: 11, scope: !40)
128+
!46 = !DILocation(line: 29, column: 3, scope: !40)
129+
!47 = distinct !DISubprogram(name: "funcB", linkageName: "_Z5funcBi", scope: !3, file: !3, line: 32, type: !7, scopeLine: 32, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
130+
!51 = !DILocation(line: 33, column: 22, scope: !47)
131+
!52 = !DILocation(line: 33, column: 11, scope: !47)
132+
!53 = !DILocation(line: 35, column: 3, scope: !47)
133+
!54 = distinct !DISubprogram(name: "funcLeaf", linkageName: "_Z8funcLeafi", scope: !3, file: !3, line: 48, type: !7, scopeLine: 48, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
134+
!57 = !DILocation(line: 49, column: 9, scope: !58)
135+
!58 = distinct !DILexicalBlock(scope: !54, file: !3, line: 49, column: 7)
136+
!59 = !DILocation(line: 49, column: 7, scope: !54)
137+
!60 = !DILocation(line: 58, column: 14, scope: !61)
138+
!61 = !DILexicalBlockFile(scope: !62, file: !3, discriminator: 2)
139+
!62 = distinct !DILexicalBlock(scope: !58, file: !3, line: 56, column: 8)
140+
!63 = !DILocation(line: 58, column: 5, scope: !61)
141+
!64 = !DILocation(line: 52, column: 16, scope: !65)
142+
!65 = distinct !DILexicalBlock(scope: !66, file: !3, line: 51, column: 19)
143+
!66 = distinct !DILexicalBlock(scope: !58, file: !3, line: 49, column: 14)
144+
!67 = !DILocation(line: 52, column: 12, scope: !65)
145+
!68 = !DILocation(line: 52, column: 9, scope: !65)
146+
!69 = !DILocation(line: 51, column: 14, scope: !70)
147+
!70 = !DILexicalBlockFile(scope: !66, file: !3, discriminator: 2)
148+
!71 = !DILocation(line: 51, column: 5, scope: !70)
149+
!72 = !DILocation(line: 59, column: 16, scope: !73)
150+
!73 = distinct !DILexicalBlock(scope: !62, file: !3, line: 58, column: 19)
151+
!74 = !DILocation(line: 59, column: 12, scope: !73)
152+
!75 = !DILocation(line: 59, column: 9, scope: !73)
153+
!76 = !DILocation(line: 63, column: 3, scope: !54)

0 commit comments

Comments
 (0)