Skip to content

Commit c662bfa

Browse files
committed
cmd/compile: boost inlining into FORs
As Than McIntosh already mentioned, it's common practice to boost inlining into FORs, since the call site could be "hotter". This patch implements this functionality. The implementation uses a stack of FORs to recognise calls which are in a loop. The stack is maintained while the inlnode function works and contains information about the ancestor FORs of the current node in inlnode. A FOR whose cost is >= inlineBigForCost(47) is considered "big". In such FORs no boost is applied. Updates #17566 The following results were obtained on GO1, while the binary size did not increase significantly: 10441232 -> 10465920, which is less than 0.3%. goos: linux goarch: amd64 pkg: test/bench/go1 cpu: Intel(R) Xeon(R) Gold 6230N CPU @ 2.30GHz name old time/op new time/op delta BinaryTree17-8 2.15s ± 1% 2.17s ± 1% +0.86% (p=0.041 n=6+6) Fannkuch11-8 2.70s ± 0% 2.72s ± 0% +0.71% (p=0.002 n=6+6) FmtFprintfEmpty-8 31.9ns ± 0% 31.6ns ± 0% -1.06% (p=0.008 n=5+5) FmtFprintfString-8 57.0ns ± 0% 58.3ns ± 0% +2.26% (p=0.004 n=6+5) FmtFprintfInt-8 65.2ns ± 0% 64.1ns ± 0% -1.65% (p=0.000 n=5+4) FmtFprintfIntInt-8 103ns ± 0% 102ns ± 0% -0.91% (p=0.000 n=5+6) FmtFprintfPrefixedInt-8 119ns ± 0% 118ns ± 0% -0.60% (p=0.008 n=5+5) FmtFprintfFloat-8 169ns ± 0% 171ns ± 0% +1.50% (p=0.004 n=5+6) FmtManyArgs-8 445ns ± 0% 445ns ± 0% ~ (p=0.506 n=6+5) GobDecode-8 4.37ms ± 1% 4.41ms ± 0% +0.79% (p=0.009 n=6+6) GobEncode-8 3.07ms ± 0% 3.05ms ± 0% -0.42% (p=0.004 n=5+6) Gzip-8 195ms ± 0% 194ms ± 0% -0.40% (p=0.009 n=5+6) Gunzip-8 28.2ms ± 0% 28.9ms ± 0% +2.22% (p=0.004 n=5+6) HTTPClientServer-8 45.0µs ± 1% 45.4µs ± 0% +0.97% (p=0.030 n=6+5) JSONEncode-8 8.01ms ± 0% 7.95ms ± 0% -0.78% (p=0.008 n=5+5) JSONDecode-8 35.3ms ± 1% 35.0ms ± 0% -1.04% (p=0.004 n=5+6) Mandelbrot200-8 4.50ms ± 0% 4.50ms ± 0% ~ (p=0.662 n=6+5) GoParse-8 3.03ms ± 1% 2.96ms ± 0% -2.41% (p=0.004 n=6+5) RegexpMatchEasy0_32-8 55.4ns ± 0% 53.8ns ± 0% -2.83% (p=0.004 n=5+6) RegexpMatchEasy0_1K-8 178ns ± 0% 162ns ± 1% -8.76% (p=0.004 n=5+6) RegexpMatchEasy1_32-8 50.1ns 
± 0% 49.6ns ± 0% -0.92% (p=0.004 n=5+6) RegexpMatchEasy1_1K-8 271ns ± 1% 268ns ± 0% -1.15% (p=0.002 n=6+6) RegexpMatchMedium_32-8 949ns ± 0% 862ns ± 0% -9.20% (p=0.008 n=5+5) RegexpMatchMedium_1K-8 27.1µs ± 7% 27.4µs ± 7% ~ (p=0.589 n=6+6) RegexpMatchHard_32-8 1.28µs ± 2% 1.27µs ± 1% ~ (p=0.065 n=6+6) RegexpMatchHard_1K-8 38.5µs ± 0% 38.5µs ± 0% ~ (p=0.132 n=6+6) Revcomp-8 397ms ± 0% 397ms ± 0% ~ (p=1.000 n=6+6) Template-8 48.1ms ± 1% 47.8ms ± 0% -0.48% (p=0.016 n=5+5) TimeParse-8 213ns ± 0% 213ns ± 0% ~ (p=0.467 n=4+6) TimeFormat-8 295ns ± 1% 294ns ± 0% ~ (p=0.554 n=6+5) [Geo mean] 40.5µs 40.2µs -0.81% name old speed new speed delta GobDecode-8 176MB/s ± 1% 174MB/s ± 0% -0.79% (p=0.009 n=6+6) GobEncode-8 250MB/s ± 0% 251MB/s ± 0% +0.42% (p=0.004 n=5+6) Gzip-8 100MB/s ± 0% 100MB/s ± 0% +0.40% (p=0.009 n=5+6) Gunzip-8 687MB/s ± 0% 672MB/s ± 0% -2.17% (p=0.004 n=5+6) JSONEncode-8 242MB/s ± 0% 244MB/s ± 0% +0.78% (p=0.008 n=5+5) JSONDecode-8 54.9MB/s ± 1% 55.5MB/s ± 0% +1.05% (p=0.004 n=5+6) GoParse-8 19.1MB/s ± 1% 19.6MB/s ± 0% +2.48% (p=0.004 n=6+5) RegexpMatchEasy0_32-8 578MB/s ± 0% 594MB/s ± 0% +2.89% (p=0.008 n=5+5) RegexpMatchEasy0_1K-8 5.74GB/s ± 1% 6.31GB/s ± 1% +9.95% (p=0.002 n=6+6) RegexpMatchEasy1_32-8 639MB/s ± 0% 645MB/s ± 0% +0.93% (p=0.004 n=5+6) RegexpMatchEasy1_1K-8 3.78GB/s ± 1% 3.82GB/s ± 0% +1.15% (p=0.002 n=6+6) RegexpMatchMedium_32-8 33.7MB/s ± 0% 37.1MB/s ± 0% +10.15% (p=0.008 n=5+5) RegexpMatchMedium_1K-8 37.9MB/s ± 6% 37.5MB/s ± 7% ~ (p=0.697 n=6+6) RegexpMatchHard_32-8 24.9MB/s ± 2% 25.1MB/s ± 1% ~ (p=0.058 n=6+6) RegexpMatchHard_1K-8 26.6MB/s ± 0% 26.6MB/s ± 0% ~ (p=0.195 n=6+6) Revcomp-8 640MB/s ± 0% 641MB/s ± 0% ~ (p=1.000 n=6+6) Template-8 40.4MB/s ± 1% 40.6MB/s ± 0% +0.47% (p=0.016 n=5+5) [Geo mean] 175MB/s 178MB/s +1.56%
1 parent ab7c904 commit c662bfa

File tree

3 files changed

+209
-26
lines changed

3 files changed

+209
-26
lines changed

src/cmd/compile/internal/inline/inl.go

+129-26
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,109 @@ const (
5151

5252
inlineBigFunctionNodes = 5000 // Functions with this many nodes are considered "big".
5353
inlineBigFunctionMaxCost = 20 // Max cost of inlinee when inlining into a "big" function.
54+
55+
inlineBigForCost = 51 // FORs with at least this cost are considered "big".
56+
inlineForMaxCost = 37 // FORs should be cheaper than this to boost inlining into themselves.
57+
inlineIntoForExtraCallCost = 6 // These extra costs were benchmarked to provide the most benefit with no bad surprises.
58+
inlineIntoForExtraInlinableCallCost = 10
59+
inlineIntoForExtraBudget = 16 // Extra budget when inlining into FORs which are not "big".
60+
61+
// The upper budget for a visitor. It accounts for the maximum cost at which a function can still be inlined, including the extra budget for FORs.
62+
inlineVisitorBudget = inlineMaxBudget + inlineIntoForExtraBudget
5463
)
5564

65+
// isInlinable reports whether fn can be inlined in the 'typical' scenario,
66+
// i.e. when no FOR boost is applied: fn must be non-nil, must carry inlining
// data (fn.Inl != nil) and its cost must not exceed inlineMaxBudget.
67+
func isInlinable(fn *ir.Func) bool {
68+
return fn != nil && fn.Inl != nil && fn.Inl.Cost <= inlineMaxBudget
69+
}
70+
71+
// forContext is one entry of inlContext.forsStack and describes a single FOR.
type forContext struct {
72+
// cost is the FOR's cost as computed by forCost when the entry was pushed.
cost int32
73+
}
74+
75+
// inlContext holds the state that inlnode and mkinlcall share while calls are
// being inlined into a single function.
type inlContext struct {
76+
// Map to keep track of functions that have been inlined at a particular
77+
// call site, in order to stop inlining when we reach the beginning of a
78+
// recursion cycle again. We don't inline immediately recursive functions,
79+
// but allow inlining if there is a recursion cycle of many functions.
80+
// Most likely, the inlining will stop before we even hit the beginning of
81+
// the cycle again, but the map catches the unusual case.
82+
inlinedCallees map[*ir.Func]bool
83+
84+
// Stack of the FORs enclosing the node inlnode is currently visiting,
// outermost first. It is used to recognise which call nodes are located
// inside FORs.
85+
forsStack []forContext
86+
initialInlineBudget int32 // Initial inline budget; boosts are calculated relative to this.
87+
}
88+
89+
// canBoostInliningIntoFor reports whether a call at the current position may
// receive the extra inlining budget for being located inside a FOR.
func (ctx inlContext) canBoostInliningIntoFor() bool {
90+
// The decision is based on:
91+
// 1) The first (outermost) FOR in the stack is not "big", i.e. its cost is
// below inlineBigForCost.
92+
// 2) The last (innermost) FOR's cost is less than inlineForMaxCost.
// An empty stack means the current node is not inside any FOR: no boost.
93+
return len(ctx.forsStack) > 0 && ctx.forsStack[0].cost < inlineBigForCost && ctx.forsStack[len(ctx.forsStack)-1].cost < inlineForMaxCost
94+
}
95+
96+
// Init prepares ctx for inlining calls into fn. It allocates the
// inlinedCallees map and selects the initial inline budget:
// inlineBigFunctionMaxCost when isBigFunc(fn) reports true, inlineMaxBudget
// otherwise.
func (ctx *inlContext) Init(fn *ir.Func) {
97+
ctx.inlinedCallees = make(map[*ir.Func]bool)
98+
99+
if isBigFunc(fn) {
100+
ctx.initialInlineBudget = inlineBigFunctionMaxCost
101+
} else {
102+
ctx.initialInlineBudget = inlineMaxBudget
103+
}
104+
}
105+
106+
// PushFor pushes the FOR node n, together with its cost as computed by
// forCost, onto the stack of enclosing FORs.
func (ctx *inlContext) PushFor(n ir.Node) {
107+
ctx.forsStack = append(ctx.forsStack, forContext{forCost(n)})
108+
109+
// Debug output: print the whole stack after the push.
// NOTE(review): LowerM presumably corresponds to the -m flag (so -m=2 and above) — confirm.
if base.Flag.LowerM > 1 {
110+
fmt.Printf("%v: add for to stack %v\n", ir.Line(n), ctx.forsStack)
111+
}
112+
}
113+
114+
// PopFor removes the most recently pushed FOR from the stack.
// It must be paired with an earlier PushFor: slicing an empty stack here
// would panic.
func (ctx *inlContext) PopFor() {
115+
ctx.forsStack = ctx.forsStack[:len(ctx.forsStack)-1]
116+
}
117+
118+
// InlineBudget returns the maximum cost a callee may have to be inlined at the
// current position: the initial budget chosen by Init, raised by
// inlineIntoForExtraBudget when canBoostInliningIntoFor allows a boost.
func (ctx inlContext) InlineBudget() int32 {
119+
finalBudget := ctx.initialInlineBudget
120+
if ctx.canBoostInliningIntoFor() && ctx.initialInlineBudget == inlineMaxBudget {
121+
// Boosts only regular functions: Init leaves the budget at
// inlineMaxBudget only for functions that are not "big".
122+
finalBudget += inlineIntoForExtraBudget
123+
}
124+
125+
return finalBudget
126+
}
127+
128+
// forCost estimates the cost of the FOR node n.
//
// It starts with a remaining budget of inlineBigForCost and walks n with
// ir.Any, charging 1 for every visited node, plus inlineIntoForExtraCallCost
// for every non-intrinsic OCALLFUNC and a further
// inlineIntoForExtraInlinableCallCost when the callee carries inlining data.
// A call marked NoInline forces the remaining budget to -1.
//
// The returned cost is inlineBigForCost minus the remaining budget. Once the
// budget has gone negative the callback returns true (which is expected to end
// the ir.Any walk early — confirm against ir.Any), so for such FORs the result
// is merely some value greater than inlineBigForCost, not an exact cost.
func forCost(n ir.Node) int32 {
129+
cost := int32(inlineBigForCost)
130+
ir.Any(n, func(n ir.Node) bool {
131+
// Every visited node costs one unit.
cost--
132+
133+
switch n.Op() {
134+
case ir.OCALLFUNC:
135+
call := n.(*ir.CallExpr)
136+
if call.NoInline {
137+
// These are deferred or go-ed calls, treating a FOR as "big".
138+
cost = -1
139+
// break only leaves the switch; the callback then returns true below.
break
140+
}
141+
142+
if ir.IsIntrinsicCall(call) {
143+
// Treat like any other node.
144+
break
145+
}
146+
147+
// A real call is charged extra, and more so if its callee is inlinable.
cost -= inlineIntoForExtraCallCost
148+
if fn := inlCallee(call.X); fn != nil && fn.Inl != nil {
149+
cost -= inlineIntoForExtraInlinableCallCost
150+
}
151+
}
152+
// Report true as soon as the budget is exhausted.
return cost < 0
153+
})
154+
return inlineBigForCost - cost
155+
}
156+
56157
// InlinePackage finds functions that can be inlined and clones them before walk expands them.
57158
func InlinePackage() {
58159
ir.VisitFuncsBottomUp(typecheck.Target.Decls, func(list []*ir.Func, recursive bool) {
@@ -167,7 +268,7 @@ func CanInline(fn *ir.Func) {
167268
// list. See issue 25249 for more context.
168269

169270
visitor := hairyVisitor{
170-
budget: inlineMaxBudget,
271+
budget: inlineVisitorBudget,
171272
extraCallCost: cc,
172273
}
173274
if visitor.tooHairy(fn) {
@@ -176,20 +277,24 @@ func CanInline(fn *ir.Func) {
176277
}
177278

178279
n.Func.Inl = &ir.Inline{
179-
Cost: inlineMaxBudget - visitor.budget,
280+
Cost: inlineVisitorBudget - visitor.budget,
180281
Dcl: pruneUnusedAutos(n.Defn.(*ir.Func).Dcl, &visitor),
181282
Body: inlcopylist(fn.Body),
182283

183284
CanDelayResults: canDelayResults(fn),
184285
}
185286

186287
if base.Flag.LowerM > 1 {
187-
fmt.Printf("%v: can inline %v with cost %d as: %v { %v }\n", ir.Line(fn), n, inlineMaxBudget-visitor.budget, fn.Type(), ir.Nodes(n.Func.Inl.Body))
188-
} else if base.Flag.LowerM != 0 {
288+
if isInlinable(n.Func) {
289+
fmt.Printf("%v: can inline %v with cost %d as: %v { %v }\n", ir.Line(fn), n, n.Func.Inl.Cost, fn.Type(), ir.Nodes(n.Func.Inl.Body))
290+
} else {
291+
fmt.Printf("%v: can inline only into small FORs %v with cost %d as: %v { %v }\n", ir.Line(fn), n, n.Func.Inl.Cost, fn.Type(), ir.Nodes(n.Func.Inl.Body))
292+
}
293+
} else if base.Flag.LowerM != 0 && isInlinable(n.Func) {
189294
fmt.Printf("%v: can inline %v\n", ir.Line(fn), n)
190295
}
191296
if logopt.Enabled() {
192-
logopt.LogOpt(fn.Pos(), "canInlineFunction", "inline", ir.FuncName(fn), fmt.Sprintf("cost: %d", inlineMaxBudget-visitor.budget))
297+
logopt.LogOpt(fn.Pos(), "canInlineFunction", "inline", ir.FuncName(fn), fmt.Sprintf("cost: %d", n.Func.Inl.Cost))
193298
}
194299
}
195300

@@ -241,7 +346,7 @@ func (v *hairyVisitor) tooHairy(fn *ir.Func) bool {
241346
return true
242347
}
243348
if v.budget < 0 {
244-
v.reason = fmt.Sprintf("function too complex: cost %d exceeds budget %d", inlineMaxBudget-v.budget, inlineMaxBudget)
349+
v.reason = fmt.Sprintf("function too complex: cost %d exceeds budget %d", inlineVisitorBudget-v.budget, inlineVisitorBudget)
245350
return true
246351
}
247352
return false
@@ -493,20 +598,13 @@ func inlcopy(n ir.Node) ir.Node {
493598
func InlineCalls(fn *ir.Func) {
494599
savefn := ir.CurFunc
495600
ir.CurFunc = fn
496-
maxCost := int32(inlineMaxBudget)
497-
if isBigFunc(fn) {
498-
maxCost = inlineBigFunctionMaxCost
499-
}
500-
// Map to keep track of functions that have been inlined at a particular
501-
// call site, in order to stop inlining when we reach the beginning of a
502-
// recursion cycle again. We don't inline immediately recursive functions,
503-
// but allow inlining if there is a recursion cycle of many functions.
504-
// Most likely, the inlining will stop before we even hit the beginning of
505-
// the cycle again, but the map catches the unusual case.
506-
inlMap := make(map[*ir.Func]bool)
601+
602+
var inlCtx inlContext
603+
inlCtx.Init(fn)
604+
507605
var edit func(ir.Node) ir.Node
508606
edit = func(n ir.Node) ir.Node {
509-
return inlnode(n, maxCost, inlMap, edit)
607+
return inlnode(n, &inlCtx, edit)
510608
}
511609
ir.EditChildren(fn, edit)
512610
ir.CurFunc = savefn
@@ -525,11 +623,16 @@ func InlineCalls(fn *ir.Func) {
525623
// shorter and less complicated.
526624
// The result of inlnode MUST be assigned back to n, e.g.
527625
// n.Left = inlnode(n.Left)
528-
func inlnode(n ir.Node, maxCost int32, inlMap map[*ir.Func]bool, edit func(ir.Node) ir.Node) ir.Node {
626+
func inlnode(n ir.Node, ctx *inlContext, edit func(ir.Node) ir.Node) ir.Node {
529627
if n == nil {
530628
return n
531629
}
532630

631+
if n.Op() == ir.OFOR {
632+
ctx.PushFor(n)
633+
defer ctx.PopFor()
634+
}
635+
533636
switch n.Op() {
534637
case ir.ODEFER, ir.OGO:
535638
n := n.(*ir.GoDeferStmt)
@@ -584,7 +687,7 @@ func inlnode(n ir.Node, maxCost int32, inlMap map[*ir.Func]bool, edit func(ir.No
584687
break
585688
}
586689
if fn := inlCallee(call.X); fn != nil && fn.Inl != nil {
587-
n = mkinlcall(call, fn, maxCost, inlMap, edit)
690+
n = mkinlcall(call, fn, ctx, edit)
588691
}
589692
}
590693

@@ -657,20 +760,20 @@ var NewInline = func(call *ir.CallExpr, fn *ir.Func, inlIndex int) *ir.InlinedCa
657760
// parameters.
658761
// The result of mkinlcall MUST be assigned back to n, e.g.
659762
// n.Left = mkinlcall(n.Left, fn, isddd)
660-
func mkinlcall(n *ir.CallExpr, fn *ir.Func, maxCost int32, inlMap map[*ir.Func]bool, edit func(ir.Node) ir.Node) ir.Node {
763+
func mkinlcall(n *ir.CallExpr, fn *ir.Func, ctx *inlContext, edit func(ir.Node) ir.Node) ir.Node {
661764
if fn.Inl == nil {
662765
if logopt.Enabled() {
663766
logopt.LogOpt(n.Pos(), "cannotInlineCall", "inline", ir.FuncName(ir.CurFunc),
664767
fmt.Sprintf("%s cannot be inlined", ir.PkgFuncName(fn)))
665768
}
666769
return n
667770
}
668-
if fn.Inl.Cost > maxCost {
771+
if fn.Inl.Cost > ctx.InlineBudget() {
669772
// The inlined function body is too big. Typically we use this check to restrict
670773
// inlining into very big functions. See issue 26546 and 17566.
671774
if logopt.Enabled() {
672775
logopt.LogOpt(n.Pos(), "cannotInlineCall", "inline", ir.FuncName(ir.CurFunc),
673-
fmt.Sprintf("cost %d of %s exceeds max large caller cost %d", fn.Inl.Cost, ir.PkgFuncName(fn), maxCost))
776+
fmt.Sprintf("cost %d of %s exceeds max large caller cost %d", fn.Inl.Cost, ir.PkgFuncName(fn), ctx.InlineBudget()))
674777
}
675778
return n
676779
}
@@ -693,15 +796,15 @@ func mkinlcall(n *ir.CallExpr, fn *ir.Func, maxCost int32, inlMap map[*ir.Func]b
693796
return n
694797
}
695798

696-
if inlMap[fn] {
799+
if ctx.inlinedCallees[fn] {
697800
if base.Flag.LowerM > 1 {
698801
fmt.Printf("%v: cannot inline %v into %v: repeated recursive cycle\n", ir.Line(n), fn, ir.FuncName(ir.CurFunc))
699802
}
700803
return n
701804
}
702-
inlMap[fn] = true
805+
ctx.inlinedCallees[fn] = true
703806
defer func() {
704-
inlMap[fn] = false
807+
ctx.inlinedCallees[fn] = false
705808
}()
706809

707810
typecheck.FixVariadicCall(n)

test/inline.go

+41
Original file line numberDiff line numberDiff line change
@@ -292,3 +292,44 @@ func conv2(v uint64) uint64 { // ERROR "can inline conv2"
292292
func conv1(v uint64) uint64 { // ERROR "can inline conv1"
293293
return uint64(uint64(uint64(uint64(uint64(uint64(uint64(uint64(uint64(uint64(uint64(v)))))))))))
294294
}
295+
296+
// Inline into FORs
297+
// func_with_cost_88 is the callee used by func_with_fors to exercise inlining
// into FORs. NOTE(review): the name suggests an inline cost of 88 — confirm.
func func_with_cost_88() {
298+
x := 200
299+
for i := 0; i < x; i++ {
300+
if i%2 == 0 {
301+
runtime.GC()
302+
} else {
303+
i += 2
304+
x += 1
305+
}
306+
}
307+
}
308+
309+
// func_with_fors calls func_with_cost_88 outside of loops, inside small FORs
// and inside a FOR containing several calls. Only the call sites annotated
// with ERROR are expected to be inlined.
func func_with_fors() {
310+
func_with_cost_88()
311+
312+
for i := 0; i < 100; i++ {
313+
func_with_cost_88() // ERROR "inlining call to func_with_cost_88"
314+
}
315+
316+
func_with_cost_88()
317+
func_with_cost_88()
318+
319+
for i := 0; i < 100; i++ {
320+
for j := 0; j < 100; j++ {
321+
func_with_cost_88() // ERROR "inlining call to func_with_cost_88"
322+
}
323+
}
324+
325+
for i := 0; i < 100; i++ {
326+
for j := 0; j < 100; j++ {
327+
// None of these calls can be inlined, since the FOR is too big.
328+
func_with_cost_88()
329+
func_with_cost_88()
330+
func_with_cost_88()
331+
}
332+
}
333+
334+
func_with_cost_88()
335+
}

test/inline_for.go

+39
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
// errorcheck -0 -m=2
2+
3+
// Copyright 2021 The Go Authors. All rights reserved.
4+
// Use of this source code is governed by a BSD-style
5+
// license that can be found in the LICENSE file.
6+
7+
// Test, using compiler diagnostic flags, that inlining is working.
8+
// Compiles but does not run.
9+
10+
package foo
11+
12+
import "runtime"
13+
14+
// func_with is a trivial function: it is expected to be reported as inlinable
// and to be inlined at its call site in func_with_fors.
func func_with() int { // ERROR "can inline func_with .*"
15+
return 10
16+
}
17+
18+
// func_with_cost_88 is expected to be reported as inlinable only into small
// FORs. Its own loop is expected to be pushed onto the FOR stack with cost 32.
// NOTE(review): the name suggests an inline cost of 88 — confirm.
func func_with_cost_88() { // ERROR "can inline only into small FORs .*"
19+
x := 200
20+
for i := 0; i < x; i++ { // ERROR "add for to stack \[\{32\}\]"
21+
if i%2 == 0 {
22+
runtime.GC()
23+
} else {
24+
i += 2
25+
x += 1
26+
}
27+
}
28+
}
29+
30+
// func_with_fors nests FORs around calls to func_with_cost_88 and func_with.
// The ERROR annotations pin the expected FOR-stack contents at each push and
// the expected inlining of both calls.
func func_with_fors() { // ERROR "cannot inline .*"
31+
for { // ERROR "add for to stack \[\{39\}\]"
32+
for { // ERROR "add for to stack \[\{39\} \{19\}\]"
33+
func_with_cost_88() // ERROR "inlining call to func_with_cost_88" "add for to stack \[\{39\} \{19\} \{32\}\]"
34+
}
35+
for { // ERROR "add for to stack"
36+
func_with() // ERROR "inlining call to func_with"
37+
}
38+
}
39+
}

0 commit comments

Comments
 (0)