
Commit 47187a4

felixge and nsrip-dd authored and committed
runtime: fix profile stack trace depth regression
Previously it was possible for mutex and block profile stack traces to contain
up to 32 frames in Stack0 or the resulting pprof profiles. CL 533258 changed
this behavior by using some of the space to record skipped frames that are
discarded when performing delayed inline expansion. This lowered the effective
maximum stack size from 32 to 27 (the max skip value is 5), which can be seen
as a small regression.

Add TestProfilerStackDepth to demonstrate the issue and protect all profile
types from similar regressions in the future.

Fix the issue by increasing the internal maxStack limit to take the maxSkip
value into account. Assert that the maxSkip value is never exceeded when
recording mutex and block profile stack traces.

Three alternative solutions to the problem were considered and discarded:

1) Revert CL 533258 and give up on frame pointer unwinding. This seems
unappealing as we would lose the performance benefits of frame pointer
unwinding.

2) Discard skipped frames when recording the initial stack trace. This would
require eager inline expansion for up to maxSkip frames and partially negate
the performance benefits of frame pointer unwinding.

3) Accept and document the new behavior. This would simplify the
implementation, but seems more confusing from a user perspective. It also
complicates the creation of test cases that make assertions about the maximum
profiling stack depth.

The execution tracer still has the same issue due to CL 463835. This should be
addressed in a follow-up CL.

Co-authored-by: Nick Ripley <[email protected]>
Change-Id: Ibf4dbf08a5166c9cb32470068c69f58bc5f98d2c
Reviewed-on: https://go-review.googlesource.com/c/go/+/586657
Reviewed-by: Austin Clements <[email protected]>
Reviewed-by: Cherry Mui <[email protected]>
LUCI-TryBot-Result: Go LUCI <[email protected]>
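In concrete terms: the old limit maxStack = 32 + 1 reserves one slot for the
"skip" sentinel, and after CL 533258 up to maxSkip = 5 further slots can be
consumed by frames that are later discarded during delayed inline expansion,
leaving 33 - 1 - 5 = 27 reportable frames in the worst case. With
maxStack = 1 + maxSkip + maxLogicalStack = 1 + 5 + 32 = 38, the sentinel and
the skipped frames no longer eat into the 32 logical frames a profile may
report.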
1 parent: 1cb13ac

2 files changed: +166 -2 lines


src/runtime/mprof.go

Lines changed: 26 additions & 2 deletions
@@ -43,10 +43,29 @@ const (
 	// Note that it's only used internally as a guard against
 	// wildly out-of-bounds slicing of the PCs that come after
 	// a bucket struct, and it could increase in the future.
-	// The "+ 1" is to account for the first stack entry being
+	// The term "1" accounts for the first stack entry being
 	// taken up by a "skip" sentinel value for profilers which
 	// defer inline frame expansion until the profile is reported.
-	maxStack = 32 + 1
+	// The term "maxSkip" is for frame pointer unwinding, where we
+	// want to end up with maxLogicalStack frames but will discard
+	// some "physical" frames to account for skipping.
+	maxStack = 1 + maxSkip + maxLogicalStack
+
+	// maxLogicalStack is the maximum stack size of a call stack
+	// to encode in a profile. This counts "logical" frames, which
+	// includes inlined frames. We may record more than this many
+	// "physical" frames when using frame pointer unwinding to account
+	// for deferred handling of skipping frames & inline expansion.
+	maxLogicalStack = 32
+	// maxSkip is to account for deferred inline expansion
+	// when using frame pointer unwinding. We record the stack
+	// with "physical" frame pointers but handle skipping "logical"
+	// frames at some point after collecting the stack. So
+	// we need extra space in order to avoid getting fewer than the
+	// desired maximum number of frames after expansion.
+	// This should be at least as large as the largest skip value
+	// used for profiling; otherwise stacks may be truncated inconsistently
+	maxSkip = 5
 )

 type bucketType int
@@ -513,6 +532,11 @@ func blocksampled(cycles, rate int64) bool {
 // skip should be positive if this event is recorded from the current stack
 // (e.g. when this is not called from a system stack)
 func saveblockevent(cycles, rate int64, skip int, which bucketType) {
+	if skip > maxSkip {
+		print("requested skip=", skip)
+		throw("invalid skip value")
+	}
+
 	gp := getg()
 	mp := acquirem() // we must not be preempted while accessing profstack
 	nstk := 1
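To make the constant arithmetic above concrete, here is a minimal standalone
sketch of the scheme those comments describe. The record and report helpers
are hypothetical stand-ins invented for illustration, not the runtime's actual
collection or expansion code:

	package main

	import "fmt"

	const (
		maxLogicalStack = 32
		maxSkip         = 5
		maxStack        = 1 + maxSkip + maxLogicalStack
	)

	// record mimics the collection path: slot 0 holds the skip count as a
	// sentinel and the raw (physical) frames follow, capped at maxStack.
	func record(skip int, frames []uintptr) []uintptr {
		buf := make([]uintptr, 0, maxStack)
		buf = append(buf, uintptr(skip))
		if len(frames) > maxStack-1 {
			frames = frames[:maxStack-1]
		}
		return append(buf, frames...)
	}

	// report mimics the deferred expansion path: it drops the sentinel and
	// the skipped frames, then caps the result at maxLogicalStack.
	func report(buf []uintptr) []uintptr {
		skip := int(buf[0])
		stk := buf[1:]
		if skip > len(stk) {
			skip = len(stk)
		}
		stk = stk[skip:]
		if len(stk) > maxLogicalStack {
			stk = stk[:maxLogicalStack]
		}
		return stk
	}

	func main() {
		raw := make([]uintptr, maxStack-1) // pretend the unwinder produced 37 PCs
		for i := range raw {
			raw[i] = uintptr(i + 1)
		}
		// With the enlarged buffer, skipping 5 frames still leaves the full
		// 32 logical frames; with the old 33-slot buffer it would leave 27.
		fmt.Println(len(report(record(5, raw))))
	}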

src/runtime/pprof/pprof_test.go

Lines changed: 140 additions & 0 deletions
@@ -2431,3 +2431,143 @@ func TestTimeVDSO(t *testing.T) {
 }
 }
 }
+
+func TestProfilerStackDepth(t *testing.T) {
+	// Disable sampling, otherwise it's difficult to assert anything.
+	oldMemRate := runtime.MemProfileRate
+	runtime.MemProfileRate = 1
+	runtime.SetBlockProfileRate(1)
+	oldMutexRate := runtime.SetMutexProfileFraction(1)
+	t.Cleanup(func() {
+		runtime.MemProfileRate = oldMemRate
+		runtime.SetBlockProfileRate(0)
+		runtime.SetMutexProfileFraction(oldMutexRate)
+	})
+
+	const depth = 32
+	go produceProfileEvents(t, depth)
+	awaitBlockedGoroutine(t, "chan receive", "goroutineDeep", 1)
+
+	tests := []struct {
+		profiler string
+		prefix   []string
+	}{
+		{"heap", []string{"runtime/pprof.allocDeep"}},
+		{"block", []string{"runtime.chanrecv1", "runtime/pprof.blockChanDeep"}},
+		{"mutex", []string{"sync.(*Mutex).Unlock", "runtime/pprof.blockMutexDeep"}},
+		{"goroutine", []string{"runtime.gopark", "runtime.chanrecv", "runtime.chanrecv1", "runtime/pprof.goroutineDeep"}},
+	}
+
+	for _, test := range tests {
+		t.Run(test.profiler, func(t *testing.T) {
+			var buf bytes.Buffer
+			if err := Lookup(test.profiler).WriteTo(&buf, 0); err != nil {
+				t.Fatalf("failed to write heap profile: %v", err)
+			}
+			p, err := profile.Parse(&buf)
+			if err != nil {
+				t.Fatalf("failed to parse heap profile: %v", err)
+			}
+			t.Logf("Profile = %v", p)
+
+			stks := stacks(p)
+			var stk []string
+			for _, s := range stks {
+				if hasPrefix(s, test.prefix) {
+					stk = s
+					break
+				}
+			}
+			if len(stk) != depth {
+				t.Fatalf("want stack depth = %d, got %d", depth, len(stk))
+			}
+
+			if rootFn, wantFn := stk[depth-1], "runtime/pprof.produceProfileEvents"; rootFn != wantFn {
+				t.Fatalf("want stack stack root %s, got %v", wantFn, rootFn)
+			}
+		})
+	}
+}
+
+func hasPrefix(stk []string, prefix []string) bool {
+	if len(prefix) > len(stk) {
+		return false
+	}
+	for i := range prefix {
+		if stk[i] != prefix[i] {
+			return false
+		}
+	}
+	return true
+}
+
+// ensure that stack records are valid map keys (comparable)
+var _ = map[runtime.MemProfileRecord]struct{}{}
+var _ = map[runtime.StackRecord]struct{}{}
+
+// allocDeep calls itself n times before calling fn.
+func allocDeep(n int) {
+	if n > 1 {
+		allocDeep(n - 1)
+		return
+	}
+	memSink = make([]byte, 1<<20)
+}
+
+// blockChanDeep produces a block profile event at stack depth n, including the
+// caller.
+func blockChanDeep(t *testing.T, n int) {
+	if n > 1 {
+		blockChanDeep(t, n-1)
+		return
+	}
+	ch := make(chan struct{})
+	go func() {
+		awaitBlockedGoroutine(t, "chan receive", "blockChanDeep", 1)
+		ch <- struct{}{}
+	}()
+	<-ch
+}
+
+// blockMutexDeep produces a block profile event at stack depth n, including the
+// caller.
+func blockMutexDeep(t *testing.T, n int) {
+	if n > 1 {
+		blockMutexDeep(t, n-1)
+		return
+	}
+	var mu sync.Mutex
+	go func() {
+		mu.Lock()
+		mu.Lock()
+	}()
+	awaitBlockedGoroutine(t, "sync.Mutex.Lock", "blockMutexDeep", 1)
+	mu.Unlock()
+}
+
+// goroutineDeep blocks at stack depth n, including the caller until the test is
+// finished.
+func goroutineDeep(t *testing.T, n int) {
+	if n > 1 {
+		goroutineDeep(t, n-1)
+		return
+	}
+	wait := make(chan struct{}, 1)
+	t.Cleanup(func() {
+		wait <- struct{}{}
+	})
+	<-wait
+}
+
+// produceProfileEvents produces pprof events at the given stack depth and then
+// blocks in goroutineDeep until the test completes. The stack traces are
+// guaranteed to have exactly the desired depth with produceProfileEvents as
+// their root frame which is expected by TestProfilerStackDepth.
+func produceProfileEvents(t *testing.T, depth int) {
+	allocDeep(depth - 1)       // -1 for produceProfileEvents, **
+	blockChanDeep(t, depth-2)  // -2 for produceProfileEvents, **, chanrecv1
+	blockMutexDeep(t, depth-2) // -2 for produceProfileEvents, **, Unlock
+	memSink = nil
+	runtime.GC()
+	goroutineDeep(t, depth-4) // -4 for produceProfileEvents, **, chanrecv1, chanrev, gopark
+}
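For reference, the new test can be run on its own with an ordinary go test
invocation from the src directory of a Go checkout, for example:

	go test -run TestProfilerStackDepth -v runtime/pprof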
