openshift
diff --git a/‎go.mod
Lines changed: 2 additions & 0 deletions b/‎go.mod
Lines changed: 2 additions & 0 deletions
diff --git a/‎go.sum
Lines changed: 2 additions & 2 deletions b/‎go.sum
Lines changed: 2 additions & 2 deletions
diff --git a/‎pkg/cmd/render/render.go
Lines changed: 29 additions & 9 deletions b/‎pkg/cmd/render/render.go
Lines changed: 29 additions & 9 deletions
diff --git a/‎pkg/cmd/render/render_test.go
Lines changed: 1 addition & 1 deletion b/‎pkg/cmd/render/render_test.go
Lines changed: 1 addition & 1 deletion
diff --git a/‎pkg/etcdcli/health.go
Lines changed: 10 additions & 0 deletions b/‎pkg/etcdcli/health.go
Lines changed: 10 additions & 0 deletions
diff --git a/‎pkg/operator/bootstrapteardown/bootstrap_teardown_controller.go
Lines changed: 2 additions & 2 deletions b/‎pkg/operator/bootstrapteardown/bootstrap_teardown_controller.go
Lines changed: 2 additions & 2 deletions
diff --git a/‎pkg/operator/ceohelpers/bootstrap.go
Lines changed: 55 additions & 10 deletions b/‎pkg/operator/ceohelpers/bootstrap.go
Lines changed: 55 additions & 10 deletions
@@ -146,3 +146,5 @@ replace (
 )
 
 replace github.com/openshift/library-go => github.com/benluddy/library-go v0.0.0-20250129150747-314ad28512db
+
+replace github.com/openshift/api => github.com/eggfoobar/api v0.0.0-20250207054050-9a92c12ec7ba
@@ -91,6 +91,8 @@ github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954/go.mod h1:vAd38F8
 github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=
 github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
 github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
+github.com/eggfoobar/api v0.0.0-20250207054050-9a92c12ec7ba h1:tzadVk7yPlwIK3s16JrjxrT/JgF2tGRS8A+nKllh+aQ=
+github.com/eggfoobar/api v0.0.0-20250207054050-9a92c12ec7ba/go.mod h1:yk60tHAmHhtVpJQo3TwVYq2zpuP70iJIFDCmeKMIzPw=
 github.com/emicklei/go-restful/v3 v3.11.0 h1:rAQeMHw1c7zTmncogyy8VvRZwtkmkZ4FxERmMY4rD+g=
 github.com/emicklei/go-restful/v3 v3.11.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc=
 github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
@@ -321,8 +323,6 @@ github.com/onsi/ginkgo/v2 v2.21.0 h1:7rg/4f3rB88pb5obDgNZrNHrQ4e6WpjonchcpuBRnZM
 github.com/onsi/ginkgo/v2 v2.21.0/go.mod h1:7Du3c42kxCUegi0IImZ1wUQzMBVecgIHjR1C+NkhLQo=
 github.com/onsi/gomega v1.35.1 h1:Cwbd75ZBPxFSuZ6T+rN/WCb/gOc6YgFBXLlZLhC7Ds4=
 github.com/onsi/gomega v1.35.1/go.mod h1:PvZbdDc8J6XJEpDK4HCuRBm8a6Fzp9/DmhC9C7yFlog=
-github.com/openshift/api v0.0.0-20250124212313-a770960d61e0 h1:dCvNfygMrPLVNQ06bpHXrxKfrXHiprO4+etHrRUqI8g=
-github.com/openshift/api v0.0.0-20250124212313-a770960d61e0/go.mod h1:yk60tHAmHhtVpJQo3TwVYq2zpuP70iJIFDCmeKMIzPw=
 github.com/openshift/build-machinery-go v0.0.0-20250102153059-e85a1a7ecb5c h1:6XcszPFZpan4qll5XbdLll7n1So3IsPn28aw2j1obMo=
 github.com/openshift/build-machinery-go v0.0.0-20250102153059-e85a1a7ecb5c/go.mod h1:8jcm8UPtg2mCAsxfqKil1xrmRMI3a+XU2TZ9fF8A7TE=
 github.com/openshift/client-go v0.0.0-20250125113824-8e1f0b8fa9a7 h1:4iliLcvr1P9EUMZgIaSNEKNQQzBn+L6PSequlFOuB6Q=
 
@@ -256,9 +256,10 @@ func newTemplateData(opts *renderOpts) (*TemplateData, error) {
 	}
 
 	// If bootstrap scaling strategy is delayed HA set annotation signal
-	if templateData.BootstrapScalingStrategy == ceohelpers.DelayedHAScalingStrategy {
+	if templateData.BootstrapScalingStrategy == ceohelpers.DelayedHAScalingStrategy ||
+		templateData.BootstrapScalingStrategy == ceohelpers.DelayedTwoNodeScalingStrategy {
 		templateData.NamespaceAnnotations = map[string]string{
-			ceohelpers.DelayedHABootstrapScalingStrategyAnnotation: "",
+			ceohelpers.DelayedBootstrapScalingStrategyAnnotation: "",
 		}
 	}
 
@@ -721,26 +722,45 @@ func getInfrastructure(file string) (*configv1.Infrastructure, error) {
 }
 
 func getBootstrapScalingStrategy(installConfig map[string]interface{}, delayedHAMarkerFile string) (ceohelpers.BootstrapScalingStrategy, error) {
-	// Delayed HA strategy is set if marker file exists on disk.
-	if _, err := os.Stat(delayedHAMarkerFile); err == nil {
-		return ceohelpers.DelayedHAScalingStrategy, nil
-	}
-
 	controlPlane, found := installConfig["controlPlane"].(map[string]interface{})
 	if !found {
 		return "", fmt.Errorf("unrecognized data structure in controlPlane field")
 	}
-	replicaCount, found := controlPlane["replicas"].(float64)
+	cpReplicaCount, found := controlPlane["replicas"].(float64)
 	if !found {
 		return "", fmt.Errorf("unrecognized data structure in controlPlane replica field")
 	}
 
 	// Bootstrap in place strategy when bootstrapInPlace root key exists in the install-config
 	// and controlPlane replicas is 1.
-	if _, found := installConfig["bootstrapInPlace"]; found && int(replicaCount) == 1 {
+	if _, found := installConfig["bootstrapInPlace"]; found && int(cpReplicaCount) == 1 {
 		return ceohelpers.BootstrapInPlaceStrategy, nil
 	}
 
+	// A delayed strategy (two-node or HA) is selected if the marker file exists on disk.
+	if _, err := os.Stat(delayedHAMarkerFile); err == nil {
+
+		// Handle two-node topologies; if an arbiter node is not present
+		// then use DelayedTwoNodeScalingStrategy to allow bootstrap to proceed
+		// without a third etcd member
+		if int(cpReplicaCount) == 2 {
+			arbiter, arbiterDefined := installConfig["arbiter"].(map[string]interface{})
+			if !arbiterDefined {
+				return ceohelpers.DelayedTwoNodeScalingStrategy, nil
+			}
+
+			arbReplicaCount, arbReplicasDefined := arbiter["replicas"].(float64)
+			if !arbReplicasDefined || arbReplicaCount < 1 {
+				return ceohelpers.DelayedTwoNodeScalingStrategy, nil
+			}
+		}
+
+		// TODO check for SNO here?
+
+		// This should handle both delayed two-node with arbiter and delayed HA clusters
+		return ceohelpers.DelayedHAScalingStrategy, nil
+	}
+
 	// HA "default".
 	return ceohelpers.HAScalingStrategy, nil
 }
@@ -298,7 +298,7 @@ func TestRenderScalingStrategyBootstrapInPlace(t *testing.T) {
 func TestRenderScalingStrategyDelayedHA(t *testing.T) {
 	want := TemplateData{
 		BootstrapScalingStrategy: ceohelpers.DelayedHAScalingStrategy,
-		NamespaceAnnotations:     map[string]string{ceohelpers.DelayedHABootstrapScalingStrategyAnnotation: ""},
+		NamespaceAnnotations:     map[string]string{ceohelpers.DelayedBootstrapScalingStrategyAnnotation: ""},
 	}
 	config := &testConfig{
 		t:                                       t,
 
@@ -227,6 +227,16 @@ func IsQuorumFaultTolerant(memberHealth []healthCheck) bool {
 	}
 	healthyMembers := len(GetHealthyMemberNames(memberHealth))
 	switch {
+	// This case should never occur when this function is called by CheckSafeToScaleCluster
+	// since this function is never called for the UnsafeScalingStrategy (which covers Single Node OpenShift)
+	// and the TwoNodeScalingStrategy and DelayedTwoNodeScalingStrategy when the cluster has two etcd members
+	// which is a special exception we make for Two Node OpenShift with Fencing.
+	//
+	// It is also never triggered by the HAScalingStrategy and DelayedHAScalingStrategy because having less
+	// than 3 healthy nodes violates these scaling strategies, which is checked before this function is called.
+	//
+	// The reason this is here is to ensure protection against 1 and 2 node membership if ever this function
+	// is called directly.
 	case totalMembers-quorum < 1:
 		klog.Errorf("etcd cluster has quorum of %d which is not fault tolerant: %+v", quorum, memberHealth)
 		return false
 
@@ -194,11 +194,11 @@ func (c *BootstrapTeardownController) canRemoveEtcdBootstrap(ctx context.Context
 		if len(members) < 4 {
 			return false, hasBootstrap, bootstrapMemberID, nil
 		}
-	case ceohelpers.DelayedHAScalingStrategy:
+	case ceohelpers.DelayedHAScalingStrategy, ceohelpers.TwoNodeScalingStrategy:
 		if len(members) < 3 {
 			return false, hasBootstrap, bootstrapMemberID, nil
 		}
-	case ceohelpers.UnsafeScalingStrategy:
+	case ceohelpers.UnsafeScalingStrategy, ceohelpers.DelayedTwoNodeScalingStrategy:
 		if len(members) < 2 {
 			return false, hasBootstrap, bootstrapMemberID, nil
 		}
 
@@ -4,6 +4,7 @@ import (
 	"context"
 	"fmt"
 
+	configv1 "github.com/openshift/api/config/v1"
 	configv1listers "github.com/openshift/client-go/config/listers/config/v1"
 	"github.com/openshift/library-go/pkg/operator/bootstrap"
 	"github.com/openshift/library-go/pkg/operator/v1helpers"
@@ -35,6 +36,22 @@ const (
 	// annotation to the openshift-etcd namesapce.
 	DelayedHAScalingStrategy BootstrapScalingStrategy = "DelayedHAScalingStrategy"
 
+	// TwoNodeScalingStrategy means the etcd cluster will only be scaled up when at least
+	// 2 nodes are available so that quorum is maintained at all times. This rule applies
+	// during bootstrapping and the steady state.
+	//
+	// This strategy is used for deployments of Two Node OpenShift with Fencing.
+	TwoNodeScalingStrategy BootstrapScalingStrategy = "TwoNodeScalingStrategy"
+
+	// DelayedTwoNodeScalingStrategy means that during bootstrapping, the etcd cluster will
+	// be allowed to scale when at least 1 member is available (which is unsafe),
+	// but after bootstrapping any further scaling will require 2 nodes in the same
+	// way as TwoNodeScalingStrategy.
+	//
+	// This strategy is intended for deploys of Two Node OpenShift with Fencing via
+	// the assisted or agent-based installers.
+	DelayedTwoNodeScalingStrategy BootstrapScalingStrategy = "DelayedTwoNodeScalingStrategy"
+
 	// BootstrapInPlaceStrategy means that the bootstrap node will never exist
 	// during the lifecycle of the cluster. Bootkube will run on a live iso
 	// afterwards the node will pivot into the manifests generated during that
@@ -54,9 +71,18 @@ const (
 )
 
 const (
-	// DelayedHABootstrapScalingStrategyAnnotation is an annotation on the openshift-etcd
-	// namespace which, if present indicates the DelayedHAScalingStrategy strategy
-	// should be used.
+	// DelayedBootstrapScalingStrategyAnnotation is an annotation on the openshift-etcd
+	// namespace which, if present, indicates that one of the delayed scaling strategies
+	// should be used. This is generally used by the assisted installer to ensure that
+	// the bootstrap node can reboot into a cluster node.
+	//
+	// For HA clusters, this will be set to DelayedHAScalingStrategy.
+	//
+	// For Two Node OpenShift with Fencing, this is set to DelayedTwoNodeScalingStrategy.
+	DelayedBootstrapScalingStrategyAnnotation = "openshift.io/delayed-bootstrap"
+
+	// DelayedHABootstrapScalingStrategyAnnotation performs the same function as the annotation
+	// above, and is kept for backwards compatibility.
 	DelayedHABootstrapScalingStrategyAnnotation = "openshift.io/delayed-ha-bootstrap"
 )
 
@@ -78,17 +104,25 @@ func GetBootstrapScalingStrategy(staticPodClient v1helpers.StaticPodOperatorClie
 	if err != nil {
 		return strategy, fmt.Errorf("failed to get %s namespace: %w", operatorclient.TargetNamespace, err)
 	}
+
+	// Check for both the delayed annotation and the legacy DelayedHABootstrapScalingStrategyAnnotation
+	_, hasDelayedAnnotation := etcdNamespace.Annotations[DelayedBootstrapScalingStrategyAnnotation]
 	_, hasDelayedHAAnnotation := etcdNamespace.Annotations[DelayedHABootstrapScalingStrategyAnnotation]
+	hasDelayedAnnotation = hasDelayedAnnotation || hasDelayedHAAnnotation
 
-	singleNode, err := IsSingleNodeTopology(infraLister)
+	topology, err := GetControlPlaneTopology(infraLister)
 	if err != nil {
 		return strategy, fmt.Errorf("failed to get control plane topology: %w", err)
 	}
 
 	switch {
-	case isUnsupportedUnsafeEtcd || singleNode:
+	case isUnsupportedUnsafeEtcd || topology == configv1.SingleReplicaTopologyMode:
 		strategy = UnsafeScalingStrategy
-	case hasDelayedHAAnnotation:
+	case topology == configv1.DualReplicaTopologyMode && hasDelayedAnnotation:
+		strategy = DelayedTwoNodeScalingStrategy
+	case topology == configv1.DualReplicaTopologyMode && !hasDelayedAnnotation:
+		strategy = TwoNodeScalingStrategy
+	case hasDelayedAnnotation:
 		strategy = DelayedHAScalingStrategy
 	default:
 		strategy = HAScalingStrategy
@@ -126,10 +160,10 @@ func CheckSafeToScaleCluster(
 
 	var minimumNodes int
 	switch scalingStrategy {
-	case HAScalingStrategy:
-		minimumNodes = 3
-	case DelayedHAScalingStrategy:
+	case HAScalingStrategy, DelayedHAScalingStrategy:
 		minimumNodes = 3
+	case TwoNodeScalingStrategy, DelayedTwoNodeScalingStrategy:
+		minimumNodes = 2
 	default:
 		return fmt.Errorf("CheckSafeToScaleCluster unrecognized scaling strategy %q", scalingStrategy)
 	}
@@ -139,8 +173,19 @@ func CheckSafeToScaleCluster(
 		return fmt.Errorf("CheckSafeToScaleCluster couldn't determine member health: %w", err)
 	}
 
+	if len(memberHealth.GetHealthyMembers()) < minimumNodes {
+		return fmt.Errorf("CheckSafeToScaleCluster found %d healthy member(s) out of the %d required by the %s",
+			len(memberHealth.GetHealthyMembers()), minimumNodes, scalingStrategy)
+	}
+
+	// Fault tolerance protection is only enforced for HA topologies
+	//
+	// TwoNodeScalingStrategy and DelayedTwoNodeScalingStrategy are used by Two Node OpenShift with
+	// Fencing (TNF), which protects etcd using a service called pacemaker that is running on the nodes.
+	// This service will intercept the static pod rollout, have that member of etcd leave the cluster,
+	// restart the static pod with the updates, and have it rejoin the cluster as a learner
 	err = etcdcli.IsQuorumFaultTolerantErr(memberHealth)
-	if err != nil {
+	if err != nil && len(memberHealth) != 2 && !(scalingStrategy == TwoNodeScalingStrategy || scalingStrategy == DelayedTwoNodeScalingStrategy) {
 		return err
 	}
Original file line number	Diff line number	Diff line change
`@@ -146,3 +146,5 @@ replace (`
`146`	`146`	`)`
`147`	`147`
`148`	`148`	`replace github.com/openshift/library-go => github.com/benluddy/library-go v0.0.0-20250129150747-314ad28512db`
	`149`	`+`
	`150`	`+replace github.com/openshift/api => github.com/eggfoobar/api v0.0.0-20250207054050-9a92c12ec7ba`
Original file line number	Diff line number	Diff line change
`@@ -298,7 +298,7 @@ func TestRenderScalingStrategyBootstrapInPlace(t *testing.T) {`
`298`	`298`	`func TestRenderScalingStrategyDelayedHA(t *testing.T) {`
`299`	`299`	`want := TemplateData{`
`300`	`300`	`BootstrapScalingStrategy: ceohelpers.DelayedHAScalingStrategy,`
`301`		`- NamespaceAnnotations: map[string]string{ceohelpers.DelayedHABootstrapScalingStrategyAnnotation: ""},`
	`301`	`+ NamespaceAnnotations: map[string]string{ceohelpers.DelayedBootstrapScalingStrategyAnnotation: ""},`
`302`	`302`	`}`
`303`	`303`	`config := &testConfig{`
`304`	`304`	`t: t,`
Original file line number	Diff line number	Diff line change
`@@ -194,11 +194,11 @@ func (c *BootstrapTeardownController) canRemoveEtcdBootstrap(ctx context.Context`
`194`	`194`	`if len(members) < 4 {`
`195`	`195`	`return false, hasBootstrap, bootstrapMemberID, nil`
`196`	`196`	`}`
`197`		`- case ceohelpers.DelayedHAScalingStrategy:`
	`197`	`+ case ceohelpers.DelayedHAScalingStrategy, ceohelpers.TwoNodeScalingStrategy:`
`198`	`198`	`if len(members) < 3 {`
`199`	`199`	`return false, hasBootstrap, bootstrapMemberID, nil`
`200`	`200`	`}`
`201`		`- case ceohelpers.UnsafeScalingStrategy:`
	`201`	`+ case ceohelpers.UnsafeScalingStrategy, ceohelpers.DelayedTwoNodeScalingStrategy:`
`202`	`202`	`if len(members) < 2 {`
`203`	`203`	`return false, hasBootstrap, bootstrapMemberID, nil`
`204`	`204`	`}`