     "context"
     "crypto/tls"
     "encoding/json"
+    "errors"
     "flag"
     "fmt"
     "html/template"
@@ -21,6 +22,7 @@ import (
     "time"

     "github.com/hashicorp/go-cleanhttp"
+    "github.com/hashicorp/go-multierror"
     "github.com/hashicorp/nomad/api"
     "github.com/hashicorp/nomad/api/contexts"
     "github.com/hashicorp/nomad/helper"
@@ -42,12 +44,15 @@ type OperatorDebugCommand struct {
     nodeClass     string
     nodeIDs       []string
     serverIDs     []string
+    topics        map[api.Topic][]string
+    index         uint64
     consul        *external
     vault         *external
     manifest      []string
     ctx           context.Context
     cancel        context.CancelFunc
     opts          *api.QueryOptions
+    verbose       bool
 }

 const (
@@ -73,6 +78,11 @@ Usage: nomad operator debug [options]
   token will also require 'agent:write', or enable_debug configuration set to
   true.

+  If event stream capture is enabled, the Job, Allocation, Deployment,
+  and Evaluation topics require 'namespace:read-job' capabilities, and the
+  Node topic requires 'node:read'. A 'management' token is required to
+  capture ACLToken, ACLPolicy, or all events.
+
   General Options:

 ` + generalOptionsUsage(usageOptsDefault|usageOptsNoNamespace) + `
@@ -137,7 +147,20 @@ Debug Options:

   -duration=<duration>
     Set the duration of the debug capture. Logs will be captured from specified servers and
-    nodes at "log-level". Defaults to 2m.
+    nodes at "log-level". Defaults to 2m.
+
+  -event-index=<index>
+    Specifies the index to start streaming events from. If the requested index
+    is no longer in the buffer, the stream will start at the next available
+    index. Defaults to 0.
+
+  -event-topic=<Allocation,Evaluation,Job,Node,*>:<filter>
+    Enable event stream capture, filtered by a comma-delimited list of topic
+    filters. Examples:
+      "all" or "*:*" for all events
+      "Evaluation" or "Evaluation:*" for all evaluation events
+      "*:example" for all events related to the job "example"
+    Defaults to "none" (disabled).

   -interval=<interval>
     The interval between snapshots of the Nomad state. Set interval equal to
@@ -173,7 +196,10 @@ Debug Options:

   -output=<path>
     Path to the parent directory of the output directory. If specified, no
-    archive is built. Defaults to the current directory.
+    archive is built. Defaults to the current directory.
+
+  -verbose
+    Enable verbose output.
 `
     return strings.TrimSpace(helpText)
 }
@@ -186,6 +212,8 @@ func (c *OperatorDebugCommand) AutocompleteFlags() complete.Flags {
     return mergeAutocompleteFlags(c.Meta.AutocompleteFlags(FlagSetClient),
         complete.Flags{
             "-duration":       complete.PredictAnything,
+            "-event-index":    complete.PredictAnything,
+            "-event-topic":    complete.PredictAnything,
             "-interval":       complete.PredictAnything,
             "-log-level":      complete.PredictSet("TRACE", "DEBUG", "INFO", "WARN", "ERROR"),
             "-max-nodes":      complete.PredictAnything,
@@ -196,6 +224,7 @@ func (c *OperatorDebugCommand) AutocompleteFlags() complete.Flags {
             "-pprof-duration": complete.PredictAnything,
             "-consul-token":   complete.PredictAnything,
             "-vault-token":    complete.PredictAnything,
+            "-verbose":        complete.PredictAnything,
         })
 }

@@ -225,7 +254,7 @@ func NodePredictor(factory ApiClientFactory) complete.Predictor {
 }

 // NodeClassPredictor returns a client node class predictor
-// TODO: Consider API options for node class filtering
+// TODO dmay: Consider API options for node class filtering
 func NodeClassPredictor(factory ApiClientFactory) complete.Predictor {
     return complete.PredictFunc(func(a complete.Args) []string {
         client, err := factory()
@@ -261,7 +290,7 @@ func NodeClassPredictor(factory ApiClientFactory) complete.Predictor {
 }

 // ServerPredictor returns a server member predictor
-// TODO: Consider API options for server member filtering
+// TODO dmay: Consider API options for server member filtering
 func ServerPredictor(factory ApiClientFactory) complete.Predictor {
     return complete.PredictFunc(func(a complete.Args) []string {
         client, err := factory()
@@ -305,11 +334,14 @@ func (c *OperatorDebugCommand) Run(args []string) int {
     flags := c.Meta.FlagSet(c.Name(), FlagSetClient)
     flags.Usage = func() { c.Ui.Output(c.Help()) }

-    var duration, interval, output, pprofDuration string
+    var duration, interval, output, pprofDuration, eventTopic string
+    var eventIndex int64
     var nodeIDs, serverIDs string
     var allowStale bool

     flags.StringVar(&duration, "duration", "2m", "")
+    flags.Int64Var(&eventIndex, "event-index", 0, "")
+    flags.StringVar(&eventTopic, "event-topic", "none", "")
     flags.StringVar(&interval, "interval", "30s", "")
     flags.StringVar(&c.logLevel, "log-level", "DEBUG", "")
     flags.IntVar(&c.maxNodes, "max-nodes", 10, "")
@@ -319,6 +351,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
     flags.BoolVar(&allowStale, "stale", false, "")
     flags.StringVar(&output, "output", "", "")
     flags.StringVar(&pprofDuration, "pprof-duration", "1s", "")
+    flags.BoolVar(&c.verbose, "verbose", false, "")

     c.consul = &external{tls: &api.TLSConfig{}}
     flags.StringVar(&c.consul.addrVal, "consul-http-addr", os.Getenv("CONSUL_HTTP_ADDR"), "")
@@ -375,6 +408,21 @@ func (c *OperatorDebugCommand) Run(args []string) int {
     }
     c.pprofDuration = pd

+    // Parse event stream topic filter
+    t, err := topicsFromString(eventTopic)
+    if err != nil {
+        c.Ui.Error(fmt.Sprintf("Error parsing event topics: %v", err))
+        return 1
+    }
+    c.topics = t
+
+    // Validate and set initial event stream index
+    if eventIndex < 0 {
+        c.Ui.Error("Event stream index must be greater than or equal to zero")
+        return 1
+    }
+    c.index = uint64(eventIndex)
+
     // Verify there are no extra arguments
     args = flags.Args()
     if l := len(args); l != 0 {
@@ -550,6 +598,9 @@ func (c *OperatorDebugCommand) Run(args []string) int {
     if c.pprofDuration.Seconds() != 1 {
         c.Ui.Output(fmt.Sprintf(" pprof Duration: %s", c.pprofDuration))
     }
+    if c.topics != nil {
+        c.Ui.Output(fmt.Sprintf(" Event topics: %+v", c.topics))
+    }
     c.Ui.Output("")
     c.Ui.Output("Capturing cluster data...")

@@ -584,8 +635,11 @@ func (c *OperatorDebugCommand) Run(args []string) int {

 // collect collects data from our endpoints and writes the archive bundle
 func (c *OperatorDebugCommand) collect(client *api.Client) error {
-    // Collect cluster data
+    // Start background captures
+    c.startMonitors(client)
+    c.startEventStream(client)

+    // Collect cluster data
     self, err := client.Agent().Self()
     c.writeJSON(clusterDir, "agent-self.json", self, err)

@@ -611,7 +665,6 @@ func (c *OperatorDebugCommand) collect(client *api.Client) error {
     c.collectAgentHosts(client)
     c.collectPprofs(client)

-    c.startMonitors(client)
     c.collectPeriodic(client)

     return nil
@@ -686,6 +739,103 @@ func (c *OperatorDebugCommand) startMonitor(path, idKey, nodeID string, client *
     }
 }

+// startEventStream launches a goroutine that wraps the event stream capture process.
+func (c *OperatorDebugCommand) startEventStream(client *api.Client) {
+    c.verboseOut("Launching eventstream goroutine...")
+
+    go func() {
+        if err := c.captureEventStream(client); err != nil {
+            var es string
+            if mErr, ok := err.(*multierror.Error); ok {
+                es = multierror.ListFormatFunc(mErr.Errors)
+            } else {
+                es = err.Error()
+            }
+
+            c.Ui.Error(fmt.Sprintf("Error capturing event stream: %s", es))
+        }
+    }()
+}
+
+func (c *OperatorDebugCommand) captureEventStream(client *api.Client) error {
+    // Ensure output directory is present
+    path := clusterDir
+    if err := c.mkdir(c.path(path)); err != nil {
+        return err
+    }
+
+    // Create the output file
+    fh, err := os.Create(c.path(path, "eventstream.json"))
+    if err != nil {
+        return err
+    }
+    defer fh.Close()
+
+    // Get handle to events endpoint
+    events := client.EventStream()
+
+    // Start streaming events
+    eventCh, err := events.Stream(c.ctx, c.topics, c.index, c.queryOpts())
+    if err != nil {
+        if errors.Is(err, context.Canceled) {
+            c.verboseOut("Event stream canceled: No events captured")
+            return nil
+        }
+        return fmt.Errorf("failed to stream events: %w", err)
+    }
+
+    eventCount := 0
+    errCount := 0
+    heartbeatCount := 0
+    channelEventCount := 0
+
+    var mErrs *multierror.Error
+
+    for {
+        select {
+        case event := <-eventCh:
+            channelEventCount++
+            if event.Err != nil {
+                errCount++
+                c.verboseOutf("error from event stream: index; %d err: %v", event.Index, event.Err)
+                mErrs = multierror.Append(mErrs, fmt.Errorf("error at index: %d, Err: %w", event.Index, event.Err))
+                break
+            }
+
+            if event.IsHeartbeat() {
+                heartbeatCount++
+                continue
+            }
+
+            for _, e := range event.Events {
+                eventCount++
+                c.verboseOutf("Event: %4d, Index: %d, Topic: %-10s, Type: %s, FilterKeys: %s", eventCount, e.Index, e.Topic, e.Type, e.FilterKeys)
+
+                bytes, err := json.Marshal(e)
+                if err != nil {
+                    errCount++
+                    mErrs = multierror.Append(mErrs, fmt.Errorf("failed to marshal json from Topic: %s, Type: %s, Err: %w", e.Topic, e.Type, err))
+                }
+
+                n, err := fh.Write(bytes)
+                if err != nil {
+                    errCount++
+                    mErrs = multierror.Append(mErrs, fmt.Errorf("failed to write bytes to eventstream.json; bytes written: %d, Err: %w", n, err))
+                    break
+                }
+                n, err = fh.WriteString("\n")
+                if err != nil {
+                    errCount++
+                    mErrs = multierror.Append(mErrs, fmt.Errorf("failed to write string to eventstream.json; chars written: %d, Err: %w", n, err))
+                }
+            }
+        case <-c.ctx.Done():
+            c.verboseOutf("Event stream captured %d events, %d frames, %d heartbeats, %d errors", eventCount, channelEventCount, heartbeatCount, errCount)
+            return mErrs.ErrorOrNil()
+        }
+    }
+}
+
 // collectAgentHosts calls collectAgentHost for each selected node
 func (c *OperatorDebugCommand) collectAgentHosts(client *api.Client) {
     for _, n := range c.nodeIDs {
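Since captureEventStream writes each api.Event as a single JSON object followed by a newline, the resulting eventstream.json can be replayed with a streaming decoder. A minimal sketch, not part of this diff; the bundle path below is illustrative:

```go
package main

import (
    "encoding/json"
    "fmt"
    "io"
    "log"
    "os"

    "github.com/hashicorp/nomad/api"
)

func main() {
    // Illustrative path: the capture writes eventstream.json under the cluster
    // directory of the debug output directory.
    f, err := os.Open("nomad-debug/cluster/eventstream.json")
    if err != nil {
        log.Fatal(err)
    }
    defer f.Close()

    // json.Decoder reads one JSON value at a time, so the newline-delimited
    // events written by the capture decode back into api.Event structs.
    dec := json.NewDecoder(f)
    for {
        var e api.Event
        if err := dec.Decode(&e); err == io.EOF {
            break
        } else if err != nil {
            log.Fatalf("decode: %v", err)
        }
        fmt.Printf("index=%d topic=%s type=%s key=%s\n", e.Index, e.Topic, e.Type, e.Key)
    }
}
```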
@@ -1192,6 +1342,16 @@ func (c *OperatorDebugCommand) trap() {
     }()
 }

+func (c *OperatorDebugCommand) verboseOut(out string) {
+    if c.verbose {
+        c.Ui.Output(out)
+    }
+}
+
+func (c *OperatorDebugCommand) verboseOutf(format string, a ...interface{}) {
+    c.verboseOut(fmt.Sprintf(format, a...))
+}
+
 // TarCZF like the tar command, recursively builds a gzip compressed tar
 // archive from a directory. If not empty, all files in the bundle are prefixed
 // with the target path.
@@ -1312,6 +1472,63 @@ func stringToSlice(input string) []string {
1312
1472
return out
1313
1473
}
1314
1474
1475
+ func parseEventTopics (topicList []string ) (map [api.Topic ][]string , error ) {
1476
+ topics := make (map [api.Topic ][]string )
1477
+
1478
+ var mErrs * multierror.Error
1479
+
1480
+ for _ , topic := range topicList {
1481
+ k , v , err := parseTopic (topic )
1482
+ if err != nil {
1483
+ mErrs = multierror .Append (mErrs , err )
1484
+ }
1485
+
1486
+ topics [api .Topic (k )] = append (topics [api .Topic (k )], v )
1487
+ }
1488
+
1489
+ return topics , mErrs .ErrorOrNil ()
1490
+ }
1491
+
1492
+ func parseTopic (input string ) (string , string , error ) {
1493
+ var topic , filter string
1494
+
1495
+ parts := strings .Split (input , ":" )
1496
+ switch len (parts ) {
1497
+ case 1 :
1498
+ // infer wildcard if only given a topic
1499
+ topic = input
1500
+ filter = "*"
1501
+ case 2 :
1502
+ topic = parts [0 ]
1503
+ filter = parts [1 ]
1504
+ default :
+        return "", "", fmt.Errorf("invalid key value pair for topic: %s", input)
+    }
+
+    return strings.Title(topic), filter, nil
+}
+
+func allTopics() map[api.Topic][]string {
+    return map[api.Topic][]string{"*": {"*"}}
+}
+
+// topicsFromString parses a comma separated list into a topicMap
+func topicsFromString(topicList string) (map[api.Topic][]string, error) {
+    if topicList == "none" {
+        return nil, nil
+    }
+    if topicList == "all" {
+        return allTopics(), nil
+    }
+
+    topics := stringToSlice(topicList)
+    topicMap, err := parseEventTopics(topics)
+    if err != nil {
+        return nil, err
+    }
+    return topicMap, nil
+}
+
 // external holds address configuration for Consul and Vault APIs
 type external struct {
     tls *api.TLSConfig
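To make the topic parsing concrete, here is a hypothetical test-style sketch of the maps the new helpers should produce for the documented -event-topic forms. The test name and the use of testify are assumptions; the expected values follow from parseTopic, allTopics, and topicsFromString above.

```go
package command

import (
    "testing"

    "github.com/hashicorp/nomad/api"
    "github.com/stretchr/testify/require"
)

// Hypothetical sketch: exercises topicsFromString from the change above.
func TestTopicsFromString_Examples(t *testing.T) {
    // "none" (the flag default) disables event stream capture entirely.
    topics, err := topicsFromString("none")
    require.NoError(t, err)
    require.Nil(t, topics)

    // "all" subscribes to every topic with a wildcard filter key.
    topics, err = topicsFromString("all")
    require.NoError(t, err)
    require.Equal(t, map[api.Topic][]string{"*": {"*"}}, topics)

    // A bare topic gets a wildcard filter and parseTopic title-cases the
    // topic name; "*:example" filters every topic by the key "example".
    topics, err = topicsFromString("node,*:example")
    require.NoError(t, err)
    require.Equal(t, map[api.Topic][]string{
        "Node": {"*"},
        "*":    {"example"},
    }, topics)
}
```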