From e40dff312e56b56b3d8889ebbced85153f9a0c05 Mon Sep 17 00:00:00 2001 From: Hongkai Liu Date: Fri, 5 Sep 2025 15:39:20 -0400 Subject: [PATCH 1/2] NO-JIRA: New rules about CO's Degraded and Available conditions The essence of the new rules is that operators MUST NOT go Available=False or Degraded=True in an HA cluster during an uneventful CI upgrade. Those rules have applied in CI for a while [1, 2] and OCPBugs have been filed in this area. In order to avoid CI failing, many exceptions have been added in the tests [3, 4] as many of those bugs are still open. Effort is expected to be invested in delivering fixes for those bugs. [1]. https://issues.redhat.com/browse/OTA-700 [2]. https://issues.redhat.com/browse/TRT-1578 [3]. https://github.com/openshift/origin/blob/2af38a7807699b3046a73f931884152a11271d21/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go#L102 [4]. https://github.com/openshift/origin/pull/27231 --- config/v1/types_cluster_operator.go | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/config/v1/types_cluster_operator.go b/config/v1/types_cluster_operator.go index a447adb9f4a..077927af0a9 100644 --- a/config/v1/types_cluster_operator.go +++ b/config/v1/types_cluster_operator.go @@ -150,13 +150,14 @@ type ClusterOperatorStatusCondition struct { type ClusterStatusConditionType string const ( - // Available indicates that the component (operator and all configured operands) + // OperatorAvailable indicates that the component (operator and all configured operands) // is functional and available in the cluster. Available=False means at least // part of the component is non-functional, and that the condition requires // immediate administrator intervention. + // A component must not report Available=False during the course of a normal upgrade unless it is running in a single-node cluster. 
OperatorAvailable ClusterStatusConditionType = "Available" - // Progressing indicates that the component (operator and all configured operands) + // OperatorProgressing indicates that the component (operator and all configured operands) // is actively rolling out new code, propagating config changes, or otherwise // moving from one steady state to another. Operators should not report // progressing when they are reconciling (without action) a previously known @@ -165,17 +166,18 @@ const ( // since it is moving from one steady state to another. OperatorProgressing ClusterStatusConditionType = "Progressing" - // Degraded indicates that the component (operator and all configured operands) + // OperatorDegraded indicates that the component (operator and all configured operands) // does not match its desired state over a period of time resulting in a lower // quality of service. The period of time may vary by component, but a Degraded - // state represents persistent observation of a condition. As a result, a + // state represents persistent observation of a condition, and it may require + // immediate administrator intervention. As a result, a // component should not oscillate in and out of Degraded state. A component may // be Available even if its degraded. For example, a component may desire 3 // running pods, but 1 pod is crash-looping. The component is Available but // Degraded because it may have a lower quality of service. A component may be // Progressing but not Degraded because the transition from one state to // another does not persist over a long enough period to report Degraded. A - // component should not report Degraded during the course of a normal upgrade. + // component must not report Degraded during the course of a normal upgrade unless it is running in a single-node cluster. // A component may report Degraded in response to a persistent infrastructure // failure that requires eventual administrator intervention. 
For example, if // a control plane host is unhealthy and must be replaced. A component should @@ -183,7 +185,7 @@ const ( // expectation is that all unexpected errors are handled as operators mature. OperatorDegraded ClusterStatusConditionType = "Degraded" - // Upgradeable indicates whether the component (operator and all configured + // OperatorUpgradeable indicates whether the component (operator and all configured // operands) is safe to upgrade based on the current cluster state. When // Upgradeable is False, the cluster-version operator will prevent the // cluster from performing impacted updates unless forced. When set on From e39501a155cf85e26e648d0a4aac65ff636e5b1a Mon Sep 17 00:00:00 2001 From: Hongkai Liu Date: Fri, 5 Sep 2025 16:05:17 -0400 Subject: [PATCH 2/2] New rules about CO's Progressing condition The essence of the new rule is that operators MUST complete their upgrade within 30 minutes in a cluster of up to 250 nodes, except for the Machine Config Operator, which has 90 minutes. This formalizes the changes introduced in cluster-version-operator#1165 [1], where the CVO begins complaining (Failing=Unknown) whenever an operator takes longer to upgrade than the given time. [1]. https://github.com/openshift/cluster-version-operator/pull/1165 --- config/v1/types_cluster_operator.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/config/v1/types_cluster_operator.go b/config/v1/types_cluster_operator.go index 077927af0a9..e9225a370a5 100644 --- a/config/v1/types_cluster_operator.go +++ b/config/v1/types_cluster_operator.go @@ -158,12 +158,15 @@ const ( OperatorAvailable ClusterStatusConditionType = "Available" // OperatorProgressing indicates that the component (operator and all configured operands) - // is actively rolling out new code, propagating config changes, or otherwise + // is actively rolling out new code, propagating config changes (e.g., a version change), or otherwise // moving from one steady state to another. 
Operators should not report // progressing when they are reconciling (without action) a previously known // state. If the observed cluster state has changed and the component is // reacting to it (scaling up for instance), Progressing should become true // since it is moving from one steady state to another. + // A component in a cluster with fewer than 250 nodes must complete a version + // change within a limited period of time: 90 minutes for the Machine Config Operator and 30 minutes for all others. + // The Machine Config Operator is given more time because it needs to restart control plane nodes. OperatorProgressing ClusterStatusConditionType = "Progressing" // OperatorDegraded indicates that the component (operator and all configured operands)