Skip to content

Commit dcb5de5

Browse files
mknyszekgopherbot
authored andcommitted
cmd/trace/v2: tolerate traces with broken tails
This change modifies cmd/trace/v2 to tolerate traces with incomplete/broken generations at the tail. These broken tails can be created if a program crashes while a trace is being produced. Although the runtime tries to flush the trace on some panics, it may still produce some extra trace data that is incomplete. This change modifies cmd/trace/v2 to still work on any complete generations, even if there are incomplete/broken generations at the tail end of the trace. Basically, the tool now just tracks when the last good generation ended (via Sync events) and truncates the trace to that point when it encounters an error. This change also revamps the text output of the tool to emit regular progress notifications as well as warnings as to how much of the trace data was lost. Fixes #65316. Change-Id: I877d39993bc02a81eebe647db9c2be17635bcec8 Reviewed-on: https://go-review.googlesource.com/c/go/+/580135 Reviewed-by: Carlos Amedee <[email protected]> LUCI-TryBot-Result: Go LUCI <[email protected]> Auto-Submit: Michael Knyszek <[email protected]>
1 parent 4324d5a commit dcb5de5

File tree

2 files changed

+113
-9
lines changed

2 files changed

+113
-9
lines changed

src/cmd/trace/v2/jsontrace_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -283,7 +283,7 @@ func getTestTrace(t *testing.T, testPath string) *parsedTrace {
283283
}
284284

285285
// Parse the test trace.
286-
parsed, err := parseTrace(&trace)
286+
parsed, err := parseTrace(&trace, int64(trace.Len()))
287287
if err != nil {
288288
t.Fatalf("failed to parse trace: %v", err)
289289
}

src/cmd/trace/v2/main.go

Lines changed: 112 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ import (
1414
"net"
1515
"net/http"
1616
"os"
17+
"sync/atomic"
18+
"time"
1719

1820
"internal/trace/v2/raw"
1921

@@ -28,9 +30,16 @@ func Main(traceFile, httpAddr, pprof string, debug int) error {
2830
}
2931
defer tracef.Close()
3032

33+
// Get the size of the trace file.
34+
fi, err := tracef.Stat()
35+
if err != nil {
36+
return fmt.Errorf("failed to stat trace file: %v", err)
37+
}
38+
traceSize := fi.Size()
39+
3140
// Handle requests for profiles.
3241
if pprof != "" {
33-
parsed, err := parseTrace(tracef)
42+
parsed, err := parseTrace(tracef, traceSize)
3443
if err != nil {
3544
return err
3645
}
@@ -72,14 +81,23 @@ func Main(traceFile, httpAddr, pprof string, debug int) error {
7281
addr := "http://" + ln.Addr().String()
7382

7483
log.Print("Preparing trace for viewer...")
75-
parsed, err := parseTrace(tracef)
84+
parsed, err := parseTraceInteractive(tracef, traceSize)
7685
if err != nil {
7786
return err
7887
}
7988
// N.B. tracef not needed after this point.
8089
// We might double-close, but that's fine; we ignore the error.
8190
tracef.Close()
8291

92+
// Print a nice message for a partial trace.
93+
if parsed.err != nil {
94+
log.Printf("Encountered error, but able to proceed. Error: %v", parsed.err)
95+
96+
lost := parsed.size - parsed.valid
97+
pct := float64(lost) / float64(parsed.size) * 100
98+
log.Printf("Lost %.2f%% of the latest trace data due to error (%s of %s)", pct, byteCount(lost), byteCount(parsed.size))
99+
}
100+
83101
log.Print("Splitting trace for viewer...")
84102
ranges, err := splitTrace(parsed)
85103
if err != nil {
@@ -140,29 +158,79 @@ func Main(traceFile, httpAddr, pprof string, debug int) error {
140158
return fmt.Errorf("failed to start http server: %w", err)
141159
}
142160

161+
func parseTraceInteractive(tr io.Reader, size int64) (parsed *parsedTrace, err error) {
162+
done := make(chan struct{})
163+
cr := countingReader{r: tr}
164+
go func() {
165+
parsed, err = parseTrace(&cr, size)
166+
done <- struct{}{}
167+
}()
168+
ticker := time.NewTicker(5 * time.Second)
169+
progressLoop:
170+
for {
171+
select {
172+
case <-ticker.C:
173+
case <-done:
174+
ticker.Stop()
175+
break progressLoop
176+
}
177+
progress := cr.bytesRead.Load()
178+
pct := float64(progress) / float64(size) * 100
179+
log.Printf("%s of %s (%.1f%%) processed...", byteCount(progress), byteCount(size), pct)
180+
}
181+
return
182+
}
183+
143184
type parsedTrace struct {
144-
events []tracev2.Event
145-
summary *trace.Summary
185+
events []tracev2.Event
186+
summary *trace.Summary
187+
size, valid int64
188+
err error
146189
}
147190

148-
func parseTrace(tr io.Reader) (*parsedTrace, error) {
149-
r, err := tracev2.NewReader(tr)
191+
func parseTrace(rr io.Reader, size int64) (*parsedTrace, error) {
192+
// Set up the reader.
193+
cr := countingReader{r: rr}
194+
r, err := tracev2.NewReader(&cr)
150195
if err != nil {
151196
return nil, fmt.Errorf("failed to create trace reader: %w", err)
152197
}
198+
199+
// Set up state.
153200
s := trace.NewSummarizer()
154201
t := new(parsedTrace)
202+
var validBytes int64
203+
var validEvents int
155204
for {
156205
ev, err := r.ReadEvent()
157206
if err == io.EOF {
207+
validBytes = cr.bytesRead.Load()
208+
validEvents = len(t.events)
209+
break
210+
}
211+
if err != nil {
212+
t.err = err
158213
break
159-
} else if err != nil {
160-
return nil, fmt.Errorf("failed to read event: %w", err)
161214
}
162215
t.events = append(t.events, ev)
163216
s.Event(&t.events[len(t.events)-1])
217+
218+
if ev.Kind() == tracev2.EventSync {
219+
validBytes = cr.bytesRead.Load()
220+
validEvents = len(t.events)
221+
}
222+
}
223+
224+
// Check to make sure we got at least one good generation.
225+
if validEvents == 0 {
226+
return nil, fmt.Errorf("failed to parse any useful part of the trace: %v", t.err)
164227
}
228+
229+
// Finish off the parsedTrace.
165230
t.summary = s.Finalize()
231+
t.valid = validBytes
232+
t.size = size
233+
t.events = t.events[:validEvents]
166234
return t, nil
167235
}
168236

@@ -217,3 +285,39 @@ func debugRawEvents(trace io.Reader) error {
217285
fmt.Println(ev.String())
218286
}
219287
}
288+
289+
type countingReader struct {
290+
r io.Reader
291+
bytesRead atomic.Int64
292+
}
293+
294+
func (c *countingReader) Read(buf []byte) (n int, err error) {
295+
n, err = c.r.Read(buf)
296+
c.bytesRead.Add(int64(n))
297+
return n, err
298+
}
299+
300+
type byteCount int64
301+
302+
func (b byteCount) String() string {
303+
var suffix string
304+
var divisor int64
305+
switch {
306+
case b < 1<<10:
307+
suffix = "B"
308+
divisor = 1
309+
case b < 1<<20:
310+
suffix = "KiB"
311+
divisor = 1 << 10
312+
case b < 1<<30:
313+
suffix = "MiB"
314+
divisor = 1 << 20
315+
case b < 1<<40:
316+
suffix = "GiB"
317+
divisor = 1 << 30
318+
}
319+
if divisor == 1 {
320+
return fmt.Sprintf("%d %s", b, suffix)
321+
}
322+
return fmt.Sprintf("%.1f %s", float64(b)/float64(divisor), suffix)
323+
}

0 commit comments

Comments
 (0)