From 0c27588a151312581929b3b228fe482957f78cb3 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Wed, 2 Jul 2025 15:13:19 -0400 Subject: [PATCH] scheduler: add reconciler annotations to completed evals The output of the reconciler stage of scheduling is only visible via debug-level logs, typically accessible only to the cluster admin. We can give job authors better ability to understand what's happening to their jobs if we expose this information to them in the `eval status` command. Add the reconciler's desired updates to the evaluation struct so it can be exposed in the API. This increases the size of evals by roughly 15% in the state store, or a bit more when there are preemptions (but we expect this will be a small minority of evals). Ref: https://hashicorp.atlassian.net/browse/NMD-818 Fixes: https://github.com/hashicorp/nomad/issues/15564 --- .changelog/26188.txt | 3 ++ api/evaluations.go | 1 + command/eval_status.go | 55 ++++++++++++++++++++++++++++++ command/eval_status_test.go | 18 ++++++++++ nomad/structs/structs.go | 3 ++ scheduler/generic_sched.go | 30 ++++++++-------- scheduler/generic_sched_test.go | 4 ++- scheduler/scheduler_system.go | 37 +++++++++++--------- scheduler/scheduler_system_test.go | 4 ++- scheduler/util.go | 5 ++- scheduler/util_test.go | 24 +++++++++---- 11 files changed, 144 insertions(+), 40 deletions(-) create mode 100644 .changelog/26188.txt diff --git a/.changelog/26188.txt b/.changelog/26188.txt new file mode 100644 index 00000000000..638c40be2b7 --- /dev/null +++ b/.changelog/26188.txt @@ -0,0 +1,3 @@ +```release-note:improvement +scheduler: Add reconciler annotations to the output of the `eval status` command +``` diff --git a/api/evaluations.go b/api/evaluations.go index 3c5440965c6..bd59b44c669 100644 --- a/api/evaluations.go +++ b/api/evaluations.go @@ -116,6 +116,7 @@ type Evaluation struct { BlockedEval string RelatedEvals []*EvaluationStub FailedTGAllocs map[string]*AllocationMetric + PlanAnnotations *PlanAnnotations ClassEligibility map[string]bool EscapedComputedClass bool QuotaLimitReached string diff --git a/command/eval_status.go b/command/eval_status.go index dc32182dca6..a4b1eb59c9e 100644 --- a/command/eval_status.go +++ b/command/eval_status.go @@ -12,6 +12,7 @@ import ( "github.com/hashicorp/nomad/api" "github.com/hashicorp/nomad/api/contexts" "github.com/posener/complete" + "github.com/ryanuber/columnize" ) type EvalStatusCommand struct { @@ -257,6 +258,36 @@ func (c *EvalStatusCommand) formatEvalStatus(eval *api.Evaluation, placedAllocs c.Ui.Output(c.Colorize().Color("\n[bold]Related Evaluations[reset]")) c.Ui.Output(formatRelatedEvalStubs(eval.RelatedEvals, length)) } + if eval.PlanAnnotations != nil { + + if len(eval.PlanAnnotations.DesiredTGUpdates) > 0 { + c.Ui.Output(c.Colorize().Color("\n[bold]Reconciler Annotations[reset]")) + annotations := make([]string, len(eval.PlanAnnotations.DesiredTGUpdates)+1) + annotations[0] = "Task Group|Ignore|Place|Stop|Migrate|InPlace|Destructive|Canary|Preemptions" + i := 1 + for tg, updates := range eval.PlanAnnotations.DesiredTGUpdates { + annotations[i] = fmt.Sprintf("%s|%d|%d|%d|%d|%d|%d|%d|%d", + tg, + updates.Ignore, + updates.Place, + updates.Stop, + updates.Migrate, + updates.InPlaceUpdate, + updates.DestructiveUpdate, + updates.Canary, + updates.Preemptions, + ) + i++ + } + c.Ui.Output(columnize.SimpleFormat(annotations)) + } + + if len(eval.PlanAnnotations.PreemptedAllocs) > 0 { + c.Ui.Output(c.Colorize().Color("\n[bold]Preempted Allocations[reset]")) + allocsOut := formatPreemptedAllocListStubs(eval.PlanAnnotations.PreemptedAllocs, length) + c.Ui.Output(allocsOut) + } + } if len(placedAllocs) > 0 { c.Ui.Output(c.Colorize().Color("\n[bold]Placed Allocations[reset]")) allocsOut := formatAllocListStubs(placedAllocs, false, length) @@ -323,3 +354,27 @@ func formatRelatedEvalStubs(evals []*api.EvaluationStub, length int) string { return formatList(out) } + +// formatPreemptedAllocListStubs formats alloc stubs but assumes they don't all +// belong to the same job, as is the case when allocs are preempted by another +// job +func formatPreemptedAllocListStubs(stubs []*api.AllocationListStub, uuidLength int) string { + allocs := make([]string, len(stubs)+1) + allocs[0] = "ID|Job ID|Node ID|Task Group|Version|Desired|Status|Created|Modified" + for i, alloc := range stubs { + now := time.Now() + createTimePretty := prettyTimeDiff(time.Unix(0, alloc.CreateTime), now) + modTimePretty := prettyTimeDiff(time.Unix(0, alloc.ModifyTime), now) + allocs[i+1] = fmt.Sprintf("%s|%s|%s|%s|%d|%s|%s|%s|%s", + limit(alloc.ID, uuidLength), + alloc.JobID, + limit(alloc.NodeID, uuidLength), + alloc.TaskGroup, + alloc.JobVersion, + alloc.DesiredStatus, + alloc.ClientStatus, + createTimePretty, + modTimePretty) + } + return formatList(allocs) +} diff --git a/command/eval_status_test.go b/command/eval_status_test.go index b09ac051274..77578b99d8d 100644 --- a/command/eval_status_test.go +++ b/command/eval_status_test.go @@ -150,6 +150,22 @@ func TestEvalStatusCommand_Format(t *testing.T) { CoalescedFailures: 0, ScoreMetaData: []*api.NodeScoreMeta{}, }}, + PlanAnnotations: &api.PlanAnnotations{ + DesiredTGUpdates: map[string]*api.DesiredUpdates{"web": {Place: 10}}, + PreemptedAllocs: []*api.AllocationListStub{ + { + ID: uuid.Generate(), + JobID: "another", + NodeID: uuid.Generate(), + TaskGroup: "db", + DesiredStatus: "evict", + JobVersion: 3, + ClientStatus: "complete", + CreateTime: now.Add(-10 * time.Minute).UnixNano(), + ModifyTime: now.Add(-2 * time.Second).UnixNano(), + }, + }, + }, ClassEligibility: map[string]bool{}, EscapedComputedClass: true, QuotaLimitReached: "", @@ -207,4 +223,6 @@ Task Group "web" (failed to place 1 allocation): must.StrContains(t, out, `Related Evaluations`) must.StrContains(t, out, `Placed Allocations`) + must.StrContains(t, out, `Reconciler Annotations`) + must.StrContains(t, out, `Preempted Allocations`) } diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index e500192cd73..6f08d54e565 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -12428,6 +12428,9 @@ type Evaluation struct { // to determine the cause. FailedTGAllocs map[string]*AllocMetric + // PlanAnnotations represents the output of the reconciliation step. + PlanAnnotations *PlanAnnotations + // ClassEligibility tracks computed node classes that have been explicitly // marked as eligible or ineligible. ClassEligibility map[string]bool diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index 882f3c102c1..e42b1aefcd8 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -73,9 +73,10 @@ type GenericScheduler struct { deployment *structs.Deployment - blocked *structs.Evaluation - failedTGAllocs map[string]*structs.AllocMetric - queuedAllocs map[string]int + blocked *structs.Evaluation + failedTGAllocs map[string]*structs.AllocMetric + queuedAllocs map[string]int + planAnnotations *structs.PlanAnnotations } // NewServiceScheduler is a factory function to instantiate a new service scheduler @@ -132,7 +133,7 @@ func (s *GenericScheduler) Process(eval *structs.Evaluation) (err error) { desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason", eval.TriggeredBy) return setStatus(s.logger, s.planner, s.eval, nil, s.blocked, - s.failedTGAllocs, structs.EvalStatusFailed, desc, s.queuedAllocs, + s.failedTGAllocs, s.planAnnotations, structs.EvalStatusFailed, desc, s.queuedAllocs, s.deployment.GetID()) } @@ -151,7 +152,7 @@ func (s *GenericScheduler) Process(eval *structs.Evaluation) (err error) { mErr.Errors = append(mErr.Errors, err) } if err := setStatus(s.logger, s.planner, s.eval, nil, s.blocked, - s.failedTGAllocs, statusErr.EvalStatus, err.Error(), + s.failedTGAllocs, s.planAnnotations, statusErr.EvalStatus, err.Error(), s.queuedAllocs, s.deployment.GetID()); err != nil { mErr.Errors = append(mErr.Errors, err) } @@ -173,7 +174,7 @@ func (s *GenericScheduler) Process(eval *structs.Evaluation) (err error) { // Update the status to complete return setStatus(s.logger, s.planner, s.eval, nil, s.blocked, - s.failedTGAllocs, structs.EvalStatusComplete, "", s.queuedAllocs, + s.failedTGAllocs, s.planAnnotations, structs.EvalStatusComplete, "", s.queuedAllocs, s.deployment.GetID()) } @@ -285,6 +286,9 @@ func (s *GenericScheduler) process() (bool, error) { } // Submit the plan and store the results. + if s.eval.AnnotatePlan { + s.plan.Annotations = s.planAnnotations + } result, newState, err := s.planner.SubmitPlan(s.plan) s.planResult = result if err != nil { @@ -359,10 +363,8 @@ func (s *GenericScheduler) computeJobAllocs() error { s.logger.Debug("reconciled current state with desired state", result.Fields()...) } - if s.eval.AnnotatePlan { - s.plan.Annotations = &structs.PlanAnnotations{ - DesiredTGUpdates: result.DesiredTGUpdates, - } + s.planAnnotations = &structs.PlanAnnotations{ + DesiredTGUpdates: result.DesiredTGUpdates, } // Add the deployment changes to the plan @@ -912,10 +914,10 @@ func (s *GenericScheduler) handlePreemptions(option *feasible.RankedNode, alloc s.plan.AppendPreemptedAlloc(stop, alloc.ID) preemptedAllocIDs = append(preemptedAllocIDs, stop.ID) - if s.eval.AnnotatePlan && s.plan.Annotations != nil { - s.plan.Annotations.PreemptedAllocs = append(s.plan.Annotations.PreemptedAllocs, stop.Stub(nil)) - if s.plan.Annotations.DesiredTGUpdates != nil { - desired := s.plan.Annotations.DesiredTGUpdates[missing.TaskGroup().Name] + if s.planAnnotations != nil { + s.planAnnotations.PreemptedAllocs = append(s.planAnnotations.PreemptedAllocs, stop.Stub(nil)) + if s.planAnnotations.DesiredTGUpdates != nil { + desired := s.planAnnotations.DesiredTGUpdates[missing.TaskGroup().Name] desired.Preemptions += 1 } } diff --git a/scheduler/generic_sched_test.go b/scheduler/generic_sched_test.go index 071bf3e0f03..3ad5f22b076 100644 --- a/scheduler/generic_sched_test.go +++ b/scheduler/generic_sched_test.go @@ -64,10 +64,12 @@ func TestServiceSched_JobRegister(t *testing.T) { } plan := h.Plans[0] - // Ensure the plan doesn't have annotations. + // Ensure the plan doesn't have annotations but the eval does if plan.Annotations != nil { t.Fatalf("expected no annotations") } + must.SliceNotEmpty(t, h.Evals) + must.Eq(t, 10, h.Evals[0].PlanAnnotations.DesiredTGUpdates["web"].Place) // Ensure the eval has no spawned blocked eval if len(h.CreateEvals) != 0 { diff --git a/scheduler/scheduler_system.go b/scheduler/scheduler_system.go index 131cec825db..862dcf261f2 100644 --- a/scheduler/scheduler_system.go +++ b/scheduler/scheduler_system.go @@ -52,8 +52,9 @@ type SystemScheduler struct { limitReached bool nextEval *structs.Evaluation - failedTGAllocs map[string]*structs.AllocMetric - queuedAllocs map[string]int + failedTGAllocs map[string]*structs.AllocMetric + queuedAllocs map[string]int + planAnnotations *structs.PlanAnnotations } // NewSystemScheduler is a factory function to instantiate a new system @@ -97,7 +98,8 @@ func (s *SystemScheduler) Process(eval *structs.Evaluation) (err error) { // Verify the evaluation trigger reason is understood if !s.canHandle(eval.TriggeredBy) { desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason", eval.TriggeredBy) - return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, s.failedTGAllocs, structs.EvalStatusFailed, desc, + return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, + s.failedTGAllocs, s.planAnnotations, structs.EvalStatusFailed, desc, s.queuedAllocs, "") } @@ -110,14 +112,16 @@ func (s *SystemScheduler) Process(eval *structs.Evaluation) (err error) { progress := func() bool { return progressMade(s.planResult) } if err := retryMax(limit, s.process, progress); err != nil { if statusErr, ok := err.(*SetStatusError); ok { - return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, s.failedTGAllocs, statusErr.EvalStatus, err.Error(), + return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, + s.failedTGAllocs, s.planAnnotations, statusErr.EvalStatus, err.Error(), s.queuedAllocs, "") } return err } // Update the status to complete - return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, s.failedTGAllocs, structs.EvalStatusComplete, "", + return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, + s.failedTGAllocs, s.planAnnotations, structs.EvalStatusComplete, "", s.queuedAllocs, "") } @@ -186,6 +190,9 @@ func (s *SystemScheduler) process() (bool, error) { } // Submit the plan + if s.eval.AnnotatePlan { + s.plan.Annotations = s.planAnnotations + } result, newState, err := s.planner.SubmitPlan(s.plan) s.planResult = result if err != nil { @@ -298,10 +305,8 @@ func (s *SystemScheduler) computeJobAllocs() error { allocExistsForTaskGroup[inplaceUpdate.TaskGroup.Name] = true } - if s.eval.AnnotatePlan { - s.plan.Annotations = &structs.PlanAnnotations{ - DesiredTGUpdates: desiredUpdates(r, inplaceUpdates, destructiveUpdates), - } + s.planAnnotations = &structs.PlanAnnotations{ + DesiredTGUpdates: desiredUpdates(r, inplaceUpdates, destructiveUpdates), } // Check if a rolling upgrade strategy is being used @@ -415,9 +420,9 @@ func (s *SystemScheduler) computePlacements(place []reconciler.AllocTuple, exist // If we are annotating the plan, then decrement the desired // placements based on whether the node meets the constraints - if s.eval.AnnotatePlan && s.plan.Annotations != nil && - s.plan.Annotations.DesiredTGUpdates != nil { - desired := s.plan.Annotations.DesiredTGUpdates[tgName] + if s.planAnnotations != nil && + s.planAnnotations.DesiredTGUpdates != nil { + desired := s.planAnnotations.DesiredTGUpdates[tgName] desired.Place -= 1 } @@ -511,10 +516,10 @@ func (s *SystemScheduler) computePlacements(place []reconciler.AllocTuple, exist s.plan.AppendPreemptedAlloc(stop, alloc.ID) preemptedAllocIDs = append(preemptedAllocIDs, stop.ID) - if s.eval.AnnotatePlan && s.plan.Annotations != nil { - s.plan.Annotations.PreemptedAllocs = append(s.plan.Annotations.PreemptedAllocs, stop.Stub(nil)) - if s.plan.Annotations.DesiredTGUpdates != nil { - desired := s.plan.Annotations.DesiredTGUpdates[tgName] + if s.eval.AnnotatePlan && s.planAnnotations != nil { + s.planAnnotations.PreemptedAllocs = append(s.planAnnotations.PreemptedAllocs, stop.Stub(nil)) + if s.planAnnotations.DesiredTGUpdates != nil { + desired := s.planAnnotations.DesiredTGUpdates[tgName] desired.Preemptions += 1 } } diff --git a/scheduler/scheduler_system_test.go b/scheduler/scheduler_system_test.go index 44f2132e260..757c6a4eaee 100644 --- a/scheduler/scheduler_system_test.go +++ b/scheduler/scheduler_system_test.go @@ -53,8 +53,10 @@ func TestSystemSched_JobRegister(t *testing.T) { must.Len(t, 1, h.Plans) plan := h.Plans[0] - // Ensure the plan does not have annotations + // Ensure the plan does not have annotations but the eval does must.Nil(t, plan.Annotations, must.Sprint("expected no annotations")) + must.SliceNotEmpty(t, h.Evals) + must.Eq(t, 10, h.Evals[0].PlanAnnotations.DesiredTGUpdates["web"].Place) // Ensure the plan allocated var planned []*structs.Allocation diff --git a/scheduler/util.go b/scheduler/util.go index 0c3ec9e71d3..31e20ff0541 100644 --- a/scheduler/util.go +++ b/scheduler/util.go @@ -540,7 +540,9 @@ func renderTemplatesUpdated(a, b *structs.RestartPolicy, msg string) comparison // setStatus is used to update the status of the evaluation func setStatus(logger log.Logger, planner sstructs.Planner, eval, nextEval, spawnedBlocked *structs.Evaluation, - tgMetrics map[string]*structs.AllocMetric, status, desc string, + tgMetrics map[string]*structs.AllocMetric, + annotations *structs.PlanAnnotations, + status, desc string, queuedAllocs map[string]int, deploymentID string) error { logger.Debug("setting eval status", "status", status) @@ -558,6 +560,7 @@ func setStatus(logger log.Logger, planner sstructs.Planner, if queuedAllocs != nil { newEval.QueuedAllocations = queuedAllocs } + newEval.PlanAnnotations = annotations return planner.UpdateEval(newEval) } diff --git a/scheduler/util_test.go b/scheduler/util_test.go index 63e1568a9d6..490fa35c396 100644 --- a/scheduler/util_test.go +++ b/scheduler/util_test.go @@ -597,7 +597,7 @@ func TestSetStatus(t *testing.T) { eval := mock.Eval() status := "a" desc := "b" - must.NoError(t, setStatus(logger, h, eval, nil, nil, nil, status, desc, nil, "")) + must.NoError(t, setStatus(logger, h, eval, nil, nil, nil, nil, status, desc, nil, "")) must.Eq(t, 1, len(h.Evals), must.Sprintf("setStatus() didn't update plan: %v", h.Evals)) newEval := h.Evals[0] @@ -607,7 +607,7 @@ func TestSetStatus(t *testing.T) { // Test next evals h = tests.NewHarness(t) next := mock.Eval() - must.NoError(t, setStatus(logger, h, eval, next, nil, nil, status, desc, nil, "")) + must.NoError(t, setStatus(logger, h, eval, next, nil, nil, nil, status, desc, nil, "")) must.Eq(t, 1, len(h.Evals), must.Sprintf("setStatus() didn't update plan: %v", h.Evals)) newEval = h.Evals[0] @@ -616,7 +616,7 @@ func TestSetStatus(t *testing.T) { // Test blocked evals h = tests.NewHarness(t) blocked := mock.Eval() - must.NoError(t, setStatus(logger, h, eval, nil, blocked, nil, status, desc, nil, "")) + must.NoError(t, setStatus(logger, h, eval, nil, blocked, nil, nil, status, desc, nil, "")) must.Eq(t, 1, len(h.Evals), must.Sprintf("setStatus() didn't update plan: %v", h.Evals)) newEval = h.Evals[0] @@ -625,7 +625,7 @@ func TestSetStatus(t *testing.T) { // Test metrics h = tests.NewHarness(t) metrics := map[string]*structs.AllocMetric{"foo": nil} - must.NoError(t, setStatus(logger, h, eval, nil, nil, metrics, status, desc, nil, "")) + must.NoError(t, setStatus(logger, h, eval, nil, nil, metrics, nil, status, desc, nil, "")) must.Eq(t, 1, len(h.Evals), must.Sprintf("setStatus() didn't update plan: %v", h.Evals)) newEval = h.Evals[0] @@ -636,15 +636,25 @@ func TestSetStatus(t *testing.T) { h = tests.NewHarness(t) queuedAllocs := map[string]int{"web": 1} - must.NoError(t, setStatus(logger, h, eval, nil, nil, metrics, status, desc, queuedAllocs, "")) + must.NoError(t, setStatus(logger, h, eval, nil, nil, metrics, nil, status, desc, queuedAllocs, "")) + must.Eq(t, 1, len(h.Evals), must.Sprintf("setStatus() didn't update plan: %v", h.Evals)) + + // Test annotations + h = tests.NewHarness(t) + annotations := &structs.PlanAnnotations{ + DesiredTGUpdates: map[string]*structs.DesiredUpdates{"web": {Place: 1}}, + PreemptedAllocs: []*structs.AllocListStub{{ID: uuid.Generate()}}, + } + + must.NoError(t, setStatus(logger, h, eval, nil, nil, metrics, annotations, status, desc, queuedAllocs, "")) must.Eq(t, 1, len(h.Evals), must.Sprintf("setStatus() didn't update plan: %v", h.Evals)) newEval = h.Evals[0] - must.Eq(t, newEval.QueuedAllocations, queuedAllocs, must.Sprintf("setStatus() didn't set failed task group metrics correctly: %v", newEval)) + must.Eq(t, annotations, newEval.PlanAnnotations, must.Sprintf("setStatus() didn't set plan annotations correctly: %v", newEval)) h = tests.NewHarness(t) dID := uuid.Generate() - must.NoError(t, setStatus(logger, h, eval, nil, nil, metrics, status, desc, queuedAllocs, dID)) + must.NoError(t, setStatus(logger, h, eval, nil, nil, metrics, nil, status, desc, queuedAllocs, dID)) must.Eq(t, 1, len(h.Evals), must.Sprintf("setStatus() didn't update plan: %v", h.Evals)) newEval = h.Evals[0]