Skip to content

Commit 8d28bfe

Browse files
authored
cli: Add event stream capture to nomad operator debug (#11865)
1 parent dc81f26 commit 8d28bfe

File tree

4 files changed

+432
-7
lines changed

4 files changed

+432
-7
lines changed

.changelog/11865.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
```release-note:improvement
2+
cli: Add event stream capture to `nomad operator debug`
3+
```

command/operator_debug.go

Lines changed: 224 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"context"
77
"crypto/tls"
88
"encoding/json"
9+
"errors"
910
"flag"
1011
"fmt"
1112
"html/template"
@@ -21,6 +22,7 @@ import (
2122
"time"
2223

2324
"github.com/hashicorp/go-cleanhttp"
25+
"github.com/hashicorp/go-multierror"
2426
"github.com/hashicorp/nomad/api"
2527
"github.com/hashicorp/nomad/api/contexts"
2628
"github.com/hashicorp/nomad/helper"
@@ -42,12 +44,15 @@ type OperatorDebugCommand struct {
4244
nodeClass string
4345
nodeIDs []string
4446
serverIDs []string
47+
topics map[api.Topic][]string
48+
index uint64
4549
consul *external
4650
vault *external
4751
manifest []string
4852
ctx context.Context
4953
cancel context.CancelFunc
5054
opts *api.QueryOptions
55+
verbose bool
5156
}
5257

5358
const (
@@ -73,6 +78,11 @@ Usage: nomad operator debug [options]
7378
token will also require 'agent:write', or enable_debug configuration set to
7479
true.
7580
81+
If event stream capture is enabled, the Job, Allocation, Deployment,
82+
and Evaluation topics require 'namespace:read-job' capabilities, the Node
83+
topic requires 'node:read'. A 'management' token is required to capture
84+
ACLToken, ACLPolicy, or all events.
85+
7686
General Options:
7787
7888
` + generalOptionsUsage(usageOptsDefault|usageOptsNoNamespace) + `
@@ -137,7 +147,20 @@ Debug Options:
137147
138148
-duration=<duration>
139149
Set the duration of the debug capture. Logs will be captured from specified servers and
140-
nodes at "log-level". Defaults to 2m.
150+
nodes at "log-level". Defaults to 2m.
151+
152+
-event-index=<index>
153+
Specifies the index to start streaming events from. If the requested index is
154+
no longer in the buffer the stream will start at the next available index.
155+
Defaults to 0.
156+
157+
-event-topic=<Allocation,Evaluation,Job,Node,*>:<filter>
158+
Enable event stream capture, filtered by comma delimited list of topic filters.
159+
Examples:
160+
"all" or "*:*" for all events
161+
"Evaluation" or "Evaluation:*" for all evaluation events
162+
"*:example" for all events related to the job "example"
163+
Defaults to "none" (disabled).
141164
142165
-interval=<interval>
143166
The interval between snapshots of the Nomad state. Set interval equal to
@@ -173,7 +196,10 @@ Debug Options:
173196
174197
-output=<path>
175198
Path to the parent directory of the output directory. If specified, no
176-
archive is built. Defaults to the current directory.
199+
archive is built. Defaults to the current directory.
200+
201+
-verbose
202+
Enable verbose output.
177203
`
178204
return strings.TrimSpace(helpText)
179205
}
@@ -186,6 +212,8 @@ func (c *OperatorDebugCommand) AutocompleteFlags() complete.Flags {
186212
return mergeAutocompleteFlags(c.Meta.AutocompleteFlags(FlagSetClient),
187213
complete.Flags{
188214
"-duration": complete.PredictAnything,
215+
"-event-index": complete.PredictAnything,
216+
"-event-topic": complete.PredictAnything,
189217
"-interval": complete.PredictAnything,
190218
"-log-level": complete.PredictSet("TRACE", "DEBUG", "INFO", "WARN", "ERROR"),
191219
"-max-nodes": complete.PredictAnything,
@@ -196,6 +224,7 @@ func (c *OperatorDebugCommand) AutocompleteFlags() complete.Flags {
196224
"-pprof-duration": complete.PredictAnything,
197225
"-consul-token": complete.PredictAnything,
198226
"-vault-token": complete.PredictAnything,
227+
"-verbose": complete.PredictAnything,
199228
})
200229
}
201230

@@ -225,7 +254,7 @@ func NodePredictor(factory ApiClientFactory) complete.Predictor {
225254
}
226255

227256
// NodeClassPredictor returns a client node class predictor
228-
// TODO: Consider API options for node class filtering
257+
// TODO dmay: Consider API options for node class filtering
229258
func NodeClassPredictor(factory ApiClientFactory) complete.Predictor {
230259
return complete.PredictFunc(func(a complete.Args) []string {
231260
client, err := factory()
@@ -261,7 +290,7 @@ func NodeClassPredictor(factory ApiClientFactory) complete.Predictor {
261290
}
262291

263292
// ServerPredictor returns a server member predictor
264-
// TODO: Consider API options for server member filtering
293+
// TODO dmay: Consider API options for server member filtering
265294
func ServerPredictor(factory ApiClientFactory) complete.Predictor {
266295
return complete.PredictFunc(func(a complete.Args) []string {
267296
client, err := factory()
@@ -305,11 +334,14 @@ func (c *OperatorDebugCommand) Run(args []string) int {
305334
flags := c.Meta.FlagSet(c.Name(), FlagSetClient)
306335
flags.Usage = func() { c.Ui.Output(c.Help()) }
307336

308-
var duration, interval, output, pprofDuration string
337+
var duration, interval, output, pprofDuration, eventTopic string
338+
var eventIndex int64
309339
var nodeIDs, serverIDs string
310340
var allowStale bool
311341

312342
flags.StringVar(&duration, "duration", "2m", "")
343+
flags.Int64Var(&eventIndex, "event-index", 0, "")
344+
flags.StringVar(&eventTopic, "event-topic", "none", "")
313345
flags.StringVar(&interval, "interval", "30s", "")
314346
flags.StringVar(&c.logLevel, "log-level", "DEBUG", "")
315347
flags.IntVar(&c.maxNodes, "max-nodes", 10, "")
@@ -319,6 +351,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
319351
flags.BoolVar(&allowStale, "stale", false, "")
320352
flags.StringVar(&output, "output", "", "")
321353
flags.StringVar(&pprofDuration, "pprof-duration", "1s", "")
354+
flags.BoolVar(&c.verbose, "verbose", false, "")
322355

323356
c.consul = &external{tls: &api.TLSConfig{}}
324357
flags.StringVar(&c.consul.addrVal, "consul-http-addr", os.Getenv("CONSUL_HTTP_ADDR"), "")
@@ -375,6 +408,21 @@ func (c *OperatorDebugCommand) Run(args []string) int {
375408
}
376409
c.pprofDuration = pd
377410

411+
// Parse event stream topic filter
412+
t, err := topicsFromString(eventTopic)
413+
if err != nil {
414+
c.Ui.Error(fmt.Sprintf("Error parsing event topics: %v", err))
415+
return 1
416+
}
417+
c.topics = t
418+
419+
// Validate and set initial event stream index
420+
if eventIndex < 0 {
421+
c.Ui.Error("Event stream index must be greater than zero")
422+
return 1
423+
}
424+
c.index = uint64(eventIndex)
425+
378426
// Verify there are no extra arguments
379427
args = flags.Args()
380428
if l := len(args); l != 0 {
@@ -550,6 +598,9 @@ func (c *OperatorDebugCommand) Run(args []string) int {
550598
if c.pprofDuration.Seconds() != 1 {
551599
c.Ui.Output(fmt.Sprintf(" pprof Duration: %s", c.pprofDuration))
552600
}
601+
if c.topics != nil {
602+
c.Ui.Output(fmt.Sprintf(" Event topics: %+v", c.topics))
603+
}
553604
c.Ui.Output("")
554605
c.Ui.Output("Capturing cluster data...")
555606

@@ -584,8 +635,11 @@ func (c *OperatorDebugCommand) Run(args []string) int {
584635

585636
// collect collects data from our endpoints and writes the archive bundle
586637
func (c *OperatorDebugCommand) collect(client *api.Client) error {
587-
// Collect cluster data
638+
// Start background captures
639+
c.startMonitors(client)
640+
c.startEventStream(client)
588641

642+
// Collect cluster data
589643
self, err := client.Agent().Self()
590644
c.writeJSON(clusterDir, "agent-self.json", self, err)
591645

@@ -611,7 +665,6 @@ func (c *OperatorDebugCommand) collect(client *api.Client) error {
611665
c.collectAgentHosts(client)
612666
c.collectPprofs(client)
613667

614-
c.startMonitors(client)
615668
c.collectPeriodic(client)
616669

617670
return nil
@@ -686,6 +739,103 @@ func (c *OperatorDebugCommand) startMonitor(path, idKey, nodeID string, client *
686739
}
687740
}
688741

742+
// captureEventStream wraps the event stream capture process.
743+
func (c *OperatorDebugCommand) startEventStream(client *api.Client) {
744+
c.verboseOut("Launching eventstream goroutine...")
745+
746+
go func() {
747+
if err := c.captureEventStream(client); err != nil {
748+
var es string
749+
if mErr, ok := err.(*multierror.Error); ok {
750+
es = multierror.ListFormatFunc(mErr.Errors)
751+
} else {
752+
es = err.Error()
753+
}
754+
755+
c.Ui.Error(fmt.Sprintf("Error capturing event stream: %s", es))
756+
}
757+
}()
758+
}
759+
760+
func (c *OperatorDebugCommand) captureEventStream(client *api.Client) error {
761+
// Ensure output directory is present
762+
path := clusterDir
763+
if err := c.mkdir(c.path(path)); err != nil {
764+
return err
765+
}
766+
767+
// Create the output file
768+
fh, err := os.Create(c.path(path, "eventstream.json"))
769+
if err != nil {
770+
return err
771+
}
772+
defer fh.Close()
773+
774+
// Get handle to events endpoint
775+
events := client.EventStream()
776+
777+
// Start streaming events
778+
eventCh, err := events.Stream(c.ctx, c.topics, c.index, c.queryOpts())
779+
if err != nil {
780+
if errors.Is(err, context.Canceled) {
781+
c.verboseOut("Event stream canceled: No events captured")
782+
return nil
783+
}
784+
return fmt.Errorf("failed to stream events: %w", err)
785+
}
786+
787+
eventCount := 0
788+
errCount := 0
789+
heartbeatCount := 0
790+
channelEventCount := 0
791+
792+
var mErrs *multierror.Error
793+
794+
for {
795+
select {
796+
case event := <-eventCh:
797+
channelEventCount++
798+
if event.Err != nil {
799+
errCount++
800+
c.verboseOutf("error from event stream: index; %d err: %v", event.Index, event.Err)
801+
mErrs = multierror.Append(mErrs, fmt.Errorf("error at index: %d, Err: %w", event.Index, event.Err))
802+
break
803+
}
804+
805+
if event.IsHeartbeat() {
806+
heartbeatCount++
807+
continue
808+
}
809+
810+
for _, e := range event.Events {
811+
eventCount++
812+
c.verboseOutf("Event: %4d, Index: %d, Topic: %-10s, Type: %s, FilterKeys: %s", eventCount, e.Index, e.Topic, e.Type, e.FilterKeys)
813+
814+
bytes, err := json.Marshal(e)
815+
if err != nil {
816+
errCount++
817+
mErrs = multierror.Append(mErrs, fmt.Errorf("failed to marshal json from Topic: %s, Type: %s, Err: %w", e.Topic, e.Type, err))
818+
}
819+
820+
n, err := fh.Write(bytes)
821+
if err != nil {
822+
errCount++
823+
mErrs = multierror.Append(mErrs, fmt.Errorf("failed to write bytes to eventstream.json; bytes written: %d, Err: %w", n, err))
824+
break
825+
}
826+
n, err = fh.WriteString("\n")
827+
if err != nil {
828+
errCount++
829+
mErrs = multierror.Append(mErrs, fmt.Errorf("failed to write string to eventstream.json; chars written: %d, Err: %w", n, err))
830+
}
831+
}
832+
case <-c.ctx.Done():
833+
c.verboseOutf("Event stream captured %d events, %d frames, %d heartbeats, %d errors", eventCount, channelEventCount, heartbeatCount, errCount)
834+
return mErrs.ErrorOrNil()
835+
}
836+
}
837+
}
838+
689839
// collectAgentHosts calls collectAgentHost for each selected node
690840
func (c *OperatorDebugCommand) collectAgentHosts(client *api.Client) {
691841
for _, n := range c.nodeIDs {
@@ -1192,6 +1342,16 @@ func (c *OperatorDebugCommand) trap() {
11921342
}()
11931343
}
11941344

1345+
func (c *OperatorDebugCommand) verboseOut(out string) {
1346+
if c.verbose {
1347+
c.Ui.Output(out)
1348+
}
1349+
}
1350+
1351+
func (c *OperatorDebugCommand) verboseOutf(format string, a ...interface{}) {
1352+
c.verboseOut(fmt.Sprintf(format, a...))
1353+
}
1354+
11951355
// TarCZF like the tar command, recursively builds a gzip compressed tar
11961356
// archive from a directory. If not empty, all files in the bundle are prefixed
11971357
// with the target path.
@@ -1312,6 +1472,63 @@ func stringToSlice(input string) []string {
13121472
return out
13131473
}
13141474

1475+
func parseEventTopics(topicList []string) (map[api.Topic][]string, error) {
1476+
topics := make(map[api.Topic][]string)
1477+
1478+
var mErrs *multierror.Error
1479+
1480+
for _, topic := range topicList {
1481+
k, v, err := parseTopic(topic)
1482+
if err != nil {
1483+
mErrs = multierror.Append(mErrs, err)
1484+
}
1485+
1486+
topics[api.Topic(k)] = append(topics[api.Topic(k)], v)
1487+
}
1488+
1489+
return topics, mErrs.ErrorOrNil()
1490+
}
1491+
1492+
// parseTopic splits input of the form "topic" or "topic:filter" into its
// topic and filter. A bare topic gets the wildcard filter "*". The returned
// topic is passed through strings.Title, so "job" becomes "Job".
//
// Input containing more than one ":" is rejected with an error naming the
// offending input.
func parseTopic(input string) (string, string, error) {
	var topic, filter string

	parts := strings.Split(input, ":")
	switch len(parts) {
	case 1:
		// infer wildcard if only given a topic
		topic = input
		filter = "*"
	case 2:
		topic = parts[0]
		filter = parts[1]
	default:
		// topic is still empty here, so report the raw input instead
		return "", "", fmt.Errorf("Invalid key value pair for topic: %s", input)
	}

	return strings.Title(topic), filter, nil
}
1510+
1511+
func allTopics() map[api.Topic][]string {
1512+
return map[api.Topic][]string{"*": {"*"}}
1513+
}
1514+
1515+
// topicsFromString parses a comma separated list into a topicMap
1516+
func topicsFromString(topicList string) (map[api.Topic][]string, error) {
1517+
if topicList == "none" {
1518+
return nil, nil
1519+
}
1520+
if topicList == "all" {
1521+
return allTopics(), nil
1522+
}
1523+
1524+
topics := stringToSlice(topicList)
1525+
topicMap, err := parseEventTopics(topics)
1526+
if err != nil {
1527+
return nil, err
1528+
}
1529+
return topicMap, nil
1530+
}
1531+
13151532
// external holds address configuration for Consul and Vault APIs
13161533
type external struct {
13171534
tls *api.TLSConfig

0 commit comments

Comments
 (0)