From 21f8199b28aa8145ade402485ec25b97b3f55d9c Mon Sep 17 00:00:00 2001 From: David Grove Date: Tue, 30 Jul 2024 15:34:23 -0400 Subject: [PATCH] drop autopilotExempt annotation Now that we have NoSchedule and NoExecute autopilot labels, we can get similar behavior without the risk of confusing the slack cluster queue lending limit adjustments with non-uniform AppWrapper behavior. --- api/v1beta2/appwrapper_types.go | 1 - .../controller/appwrapper/appwrapper_controller.go | 13 +------------ .../controller/appwrapper/resource_management.go | 2 +- site/_pages/arch-fault-tolerance.md | 8 -------- 4 files changed, 2 insertions(+), 22 deletions(-) diff --git a/api/v1beta2/appwrapper_types.go b/api/v1beta2/appwrapper_types.go index a177aa5..66d5a9e 100644 --- a/api/v1beta2/appwrapper_types.go +++ b/api/v1beta2/appwrapper_types.go @@ -174,7 +174,6 @@ const ( SuccessTTLAnnotation = "workload.codeflare.dev.appwrapper/successTTLDuration" TerminalExitCodesAnnotation = "workload.codeflare.dev.appwrapper/terminalExitCodes" RetryableExitCodesAnnotation = "workload.codeflare.dev.appwrapper/retryableExitCodes" - AutopilotExemptAnnotation = "workload.codeflare.dev.appwrapper/autopilotExempt" ) //+kubebuilder:object:root=true diff --git a/internal/controller/appwrapper/appwrapper_controller.go b/internal/controller/appwrapper/appwrapper_controller.go index e5bce1d..aba23f2 100644 --- a/internal/controller/appwrapper/appwrapper_controller.go +++ b/internal/controller/appwrapper/appwrapper_controller.go @@ -521,7 +521,7 @@ func (r *AppWrapperReconciler) getPodStatus(ctx context.Context, aw *workloadv1b return nil, err } summary := &podStatusSummary{expected: pc} - checkUnhealthyNodes := r.Config.Autopilot != nil && r.Config.Autopilot.MonitorNodes && !r.isAutopilotExempt(ctx, aw) + checkUnhealthyNodes := r.Config.Autopilot != nil && r.Config.Autopilot.MonitorNodes for _, pod := range pods.Items { switch pod.Status.Phase { @@ -866,17 +866,6 @@ func (r *AppWrapperReconciler) retryableExitCodes(_ context.Context, aw *workloa return ans } -func (r *AppWrapperReconciler) isAutopilotExempt(ctx context.Context, aw *workloadv1beta2.AppWrapper) bool { - if v, ok := aw.Annotations[workloadv1beta2.AutopilotExemptAnnotation]; ok { - if isExempt, err := strconv.ParseBool(v); err == nil { - return isExempt - } else { - log.FromContext(ctx).Error(err, "Malformed autopilotExempt annotation; treating as false", "annotation", v) - } - } - return false -} - func clearCondition(aw *workloadv1beta2.AppWrapper, condition workloadv1beta2.AppWrapperCondition, reason string, message string) { if meta.IsStatusConditionTrue(aw.Status.Conditions, string(condition)) { meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{ diff --git a/internal/controller/appwrapper/resource_management.go b/internal/controller/appwrapper/resource_management.go index 1597c03..b3b6562 100644 --- a/internal/controller/appwrapper/resource_management.go +++ b/internal/controller/appwrapper/resource_management.go @@ -252,7 +252,7 @@ func (r *AppWrapperReconciler) createComponent(ctx context.Context, aw *workload } } - if r.Config.Autopilot != nil && r.Config.Autopilot.InjectAntiAffinities && !r.isAutopilotExempt(ctx, aw) { + if r.Config.Autopilot != nil && r.Config.Autopilot.InjectAntiAffinities { toAdd := map[string][]string{} for resource, taints := range r.Config.Autopilot.ResourceTaints { if hasResourceRequest(spec, resource) { diff --git a/site/_pages/arch-fault-tolerance.md b/site/_pages/arch-fault-tolerance.md index 60cef4f..84ce13d 100644 --- a/site/_pages/arch-fault-tolerance.md +++ b/site/_pages/arch-fault-tolerance.md @@ -127,13 +127,6 @@ begins. Since the AppWrapper continues to consume quota during this delayed dele this annotation should be used sparingly and only when interactive debugging of the failed workload is being actively pursued. -An AppWrapper can be annotated as `autopilotExempt` to disable the -injection of Autopilot Node anti-affinities into its Pods and the -automatic migration of its Pods away from Nodes with Autopilot tagged -unhealthy resources. This annotation should only be used for workloads -that will be closely monitored by other means to identify and recover from -unhealthy Nodes in the cluster. - All child resources for an AppWrapper that successfully completed will be automatically deleted after a `SuccessTTL` after the AppWrapper entered the `Succeeded` state. @@ -154,7 +147,6 @@ can be used to customize them. | DeletionOnFailureGracePeriod | 0 Seconds | workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration | | ForcefulDeletionGracePeriod | 10 Minutes | workload.codeflare.dev.appwrapper/forcefulDeletionGracePeriodDuration | | SuccessTTL | 7 Days | workload.codeflare.dev.appwrapper/successTTLDuration | -| AutopilotExempt | false | workload.codeflare.dev.appwrapper/autopilotExempt | | GracePeriodMaximum | 24 Hours | Not Applicable | The `GracePeriodMaximum` imposes a system-wide upper limit on all other grace periods to