Skip to content

Commit 3729e59

Browse files
authored
Implement lease-based certificate renewal (#265)
* Update Fabric components to version 3.1.0 and implement lease-based certificate renewal

  - Bumped the PEER and ORDERER versions to 3.1.0 in the README.
  - Added a new boolean field `certRenewalLeaseHeld` to the FabricOrdererNodeStatus for tracking lease status.
  - Implemented lease acquisition and release logic in the ordnode_controller for managing certificate renewal, enhancing the reliability of the renewal process.
  - Introduced a new utility for handling Kubernetes leases to facilitate distributed locking during certificate updates.

  These changes improve the management of certificate renewals and ensure smoother operations for Fabric orderer nodes.

  Signed-off-by: David VIEJO <[email protected]>

* Refactor certificate renewal logic in Fabric controllers

  - Removed redundant deployment restart logic from the `updateCerts` method in both `ordnode_controller` and `peer_controller`.
  - Implemented lease-based locking for certificate renewal in the `peer_controller`, enhancing the reliability of the renewal process.
  - Added a new field `CertRenewalLeaseHeld` to the `FabricPeerStatus` struct to track the lease status during certificate updates.

  These changes streamline the certificate renewal process and improve the overall management of Fabric components.

  Signed-off-by: David VIEJO <[email protected]>

---------

Signed-off-by: David VIEJO <[email protected]>
1 parent b5f0c3a commit 3729e59

File tree

6 files changed

+207
-48
lines changed

6 files changed

+207
-48
lines changed

config/crd/bases/hlf.kungfusoftware.es_fabricorderernodes.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1359,6 +1359,8 @@ spec:
13591359
properties:
13601360
adminPort:
13611361
type: integer
1362+
certRenewalLeaseHeld:
1363+
type: boolean
13621364
conditions:
13631365
items:
13641366
properties:

config/crd/bases/hlf.kungfusoftware.es_fabricpeers.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2485,6 +2485,8 @@ spec:
24852485
type: object
24862486
status:
24872487
properties:
2488+
certRenewalLeaseHeld:
2489+
type: boolean
24882490
conditions:
24892491
items:
24902492
properties:

controllers/ordnode/ordnode_controller.go

Lines changed: 62 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -227,13 +227,58 @@ func (r *FabricOrdererNodeReconciler) Reconcile(ctx context.Context, req ctrl.Re
227227
}
228228
requeueAfter := time.Second * 10
229229
log.Infof("Last time certs were updated: %v, they need to be renewed: %v", lastTimeCertsRenewed, certificatesNeedToBeRenewed)
230+
231+
// --- RELEASE LEASE IF HELD AND STATUS IS RUNNING ---
232+
if fabricOrdererNode.Status.CertRenewalLeaseHeld && fabricOrdererNode.Status.Status == hlfv1alpha1.RunningStatus {
233+
leaseName := "orderernode-cert-renewal-global-lock"
234+
holderIdentity := os.Getenv("POD_NAME")
235+
if holderIdentity == "" {
236+
holderIdentity = fmt.Sprintf("orderernode-%s-lock", fabricOrdererNode.Name)
237+
}
238+
err := utils.ReleaseLease(ctx, clientSet, leaseName, ns, holderIdentity)
239+
if err != nil {
240+
log.Warnf("Error releasing lease: %v", err)
241+
} else {
242+
log.Infof("Released cert renewal lease for %s", fabricOrdererNode.Name)
243+
}
244+
fabricOrdererNode.Status.CertRenewalLeaseHeld = false
245+
if err := r.Status().Update(ctx, fabricOrdererNode); err != nil {
246+
log.Errorf("Error updating status after releasing lease: %v", err)
247+
}
248+
}
249+
230250
if certificatesNeedToBeRenewed {
251+
// Lease-based lock for cert renewal (global lock)
252+
leaseName := "orderernode-cert-renewal-global-lock"
253+
holderIdentity := os.Getenv("POD_NAME")
254+
if holderIdentity == "" {
255+
holderIdentity = fmt.Sprintf("orderernode-%s-lock", fabricOrdererNode.Name)
256+
}
257+
leaseTTL := int32(120)
258+
acquired := false
259+
for i := 0; i < 5; i++ { // try for ~5 seconds
260+
ok, err := utils.AcquireLease(ctx, clientSet, leaseName, ns, holderIdentity, leaseTTL)
261+
if err != nil {
262+
log.Warnf("Error acquiring lease: %v", err)
263+
}
264+
if ok {
265+
acquired = true
266+
break
267+
}
268+
time.Sleep(time.Second)
269+
}
270+
if !acquired {
271+
log.Warnf("Could not acquire cert renewal lock for %s, skipping renewal", fabricOrdererNode.Name)
272+
return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
273+
}
274+
// Set lease held flag
275+
fabricOrdererNode.Status.CertRenewalLeaseHeld = true
276+
if err := r.Status().Update(ctx, fabricOrdererNode); err != nil {
277+
log.Errorf("Error updating status after acquiring lease: %v", err)
278+
}
231279
// must update the certificates and block until it's done
232-
// scale down to zero replicas
233-
// wait for the deployment to scale down
234-
// update the certs
235-
// scale up the peer
236-
log.Infof("Trying to upgrade certs")
280+
log.Infof("Trying to upgrade certs (lease acquired)")
281+
r.setConditionStatus(ctx, fabricOrdererNode, hlfv1alpha1.UpdatingCertificates, false, nil, false)
237282
err := r.updateCerts(req, fabricOrdererNode, clientSet, releaseName, ctx, cfg, ns)
238283
if err != nil {
239284
log.Errorf("Error renewing certs: %v", err)
@@ -249,7 +294,7 @@ func (r *FabricOrdererNodeReconciler) Reconcile(ctx context.Context, req ctrl.Re
249294
if err != nil {
250295
return ctrl.Result{}, err
251296
}
252-
err = r.upgradeChart(cfg, err, ns, releaseName, c)
297+
err = r.upgradeChartWithWait(cfg, err, ns, releaseName, c, false, 5*time.Minute)
253298
if err != nil {
254299
r.setConditionStatus(ctx, fabricOrdererNode, hlfv1alpha1.FailedStatus, false, err, false)
255300
return r.updateCRStatusOrFailReconcile(ctx, r.Log, fabricOrdererNode)
@@ -451,35 +496,25 @@ func (r *FabricOrdererNodeReconciler) updateCerts(req ctrl.Request, node *hlfv1a
451496
log.Errorf("Error getting the config: %v", err)
452497
return errors.Wrapf(err, "Error getting the config: %v", err)
453498
}
454-
//config.Replicas = 0
455-
err = r.upgradeChart(cfg, err, ns, releaseName, config)
499+
// Force Wait=true and Timeout=5m for cert renewal
500+
wait := true
501+
timeout := 5 * time.Minute
502+
err = r.upgradeChartWithWait(cfg, err, ns, releaseName, config, wait, timeout)
456503
if err != nil {
457504
return errors.Wrapf(err, "Error upgrading the chart: %v", err)
458505
}
459-
dep, err := GetOrdererDeployment(
460-
cfg,
461-
r.Config,
462-
releaseName,
463-
req.Namespace,
464-
)
465-
if err != nil {
466-
return errors.Wrapf(err, "Error getting the deployment: %v", err)
467-
}
468-
err = restartDeployment(
469-
r.Config,
470-
dep,
471-
)
472-
if err != nil {
473-
return errors.Wrapf(err, "Error restarting the deployment: %v", err)
474-
}
475506
return nil
476507
}
477-
func (r *FabricOrdererNodeReconciler) upgradeChart(
508+
509+
// upgradeChartWithWait is like upgradeChart but allows overriding Wait/Timeout
510+
func (r *FabricOrdererNodeReconciler) upgradeChartWithWait(
478511
cfg *action.Configuration,
479512
err error,
480513
ns string,
481514
releaseName string,
482515
c *fabricOrdChart,
516+
wait bool,
517+
timeout time.Duration,
483518
) error {
484519
inrec, err := json.Marshal(c)
485520
if err != nil {
@@ -504,8 +539,8 @@ func (r *FabricOrdererNodeReconciler) upgradeChart(
504539
if err != nil {
505540
return err
506541
}
507-
cmd.Wait = r.Wait
508-
cmd.Timeout = r.Timeout
542+
cmd.Wait = wait
543+
cmd.Timeout = timeout
509544
cmd.MaxHistory = r.MaxHistory
510545

511546
release, err := cmd.Run(releaseName, ch, inInterface)

controllers/peer/peer_controller.go

Lines changed: 50 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -454,13 +454,58 @@ func (r *FabricPeerReconciler) Reconcile(ctx context.Context, req ctrl.Request)
454454
}
455455
requeueAfter := time.Second * 10
456456
log.Infof("Peer: Last time certs were updated: %v, they need to be renewed: %v", lastTimeCertsRenewed, certificatesNeedToBeRenewed)
457+
458+
// --- RELEASE LEASE IF HELD AND STATUS IS RUNNING ---
459+
if fabricPeer.Status.CertRenewalLeaseHeld && fabricPeer.Status.Status == hlfv1alpha1.RunningStatus {
460+
leaseName := "peer-cert-renewal-global-lock"
461+
holderIdentity := os.Getenv("POD_NAME")
462+
if holderIdentity == "" {
463+
holderIdentity = fmt.Sprintf("peer-%s-lock", fabricPeer.Name)
464+
}
465+
err := utils.ReleaseLease(ctx, clientSet, leaseName, ns, holderIdentity)
466+
if err != nil {
467+
log.Warnf("Error releasing lease: %v", err)
468+
} else {
469+
log.Infof("Released cert renewal lease for %s", fabricPeer.Name)
470+
}
471+
fabricPeer.Status.CertRenewalLeaseHeld = false
472+
if err := r.Status().Update(ctx, fabricPeer); err != nil {
473+
log.Errorf("Error updating status after releasing lease: %v", err)
474+
}
475+
}
476+
457477
if certificatesNeedToBeRenewed {
478+
// Lease-based lock for cert renewal (global lock)
479+
leaseName := "peer-cert-renewal-global-lock"
480+
holderIdentity := os.Getenv("POD_NAME")
481+
if holderIdentity == "" {
482+
holderIdentity = fmt.Sprintf("peer-%s-lock", fabricPeer.Name)
483+
}
484+
leaseTTL := int32(120)
485+
acquired := false
486+
for i := 0; i < 5; i++ { // try for ~5 seconds
487+
ok, err := utils.AcquireLease(ctx, clientSet, leaseName, ns, holderIdentity, leaseTTL)
488+
if err != nil {
489+
log.Warnf("Error acquiring lease: %v", err)
490+
}
491+
if ok {
492+
acquired = true
493+
break
494+
}
495+
time.Sleep(time.Second)
496+
}
497+
if !acquired {
498+
log.Warnf("Could not acquire cert renewal lock for %s, skipping renewal", fabricPeer.Name)
499+
return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
500+
}
501+
// Set lease held flag
502+
fabricPeer.Status.CertRenewalLeaseHeld = true
503+
if err := r.Status().Update(ctx, fabricPeer); err != nil {
504+
log.Errorf("Error updating status after acquiring lease: %v", err)
505+
}
458506
// must update the certificates and block until it's done
459-
// scale down to zero replicas
460-
// wait for the deployment to scale down
461-
// update the certs
462-
// scale up the peer
463-
log.Infof("Trying to upgrade certs")
507+
log.Infof("Trying to upgrade certs (lease acquired)")
508+
r.setConditionStatus(ctx, fabricPeer, hlfv1alpha1.UpdatingCertificates, false, nil, false)
464509
err := r.updateCerts(req, fabricPeer, clientSet, releaseName, svc, ctx, cfg, ns)
465510
if err != nil {
466511
log.Errorf("Error renewing certs: %v", err)
@@ -617,22 +662,6 @@ func (r *FabricPeerReconciler) updateCerts(req ctrl.Request, fPeer *hlfv1alpha1.
617662
if err != nil {
618663
return err
619664
}
620-
dep, err := GetPeerDeployment(
621-
cfg,
622-
r.Config,
623-
releaseName,
624-
req.Namespace,
625-
)
626-
if err != nil {
627-
return err
628-
}
629-
err = restartDeployment(
630-
r.Config,
631-
dep,
632-
)
633-
if err != nil {
634-
return err
635-
}
636665
return nil
637666
}
638667

controllers/utils/lease.go

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
package utils
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"time"
7+
8+
coordinationv1 "k8s.io/api/coordination/v1"
9+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
10+
"k8s.io/client-go/kubernetes"
11+
)
12+
13+
// AcquireLease tries to acquire a Lease for distributed locking. Returns true if lock acquired, false if not.
14+
func AcquireLease(ctx context.Context, clientset *kubernetes.Clientset, leaseName, namespace, holderIdentity string, ttlSeconds int32) (bool, error) {
15+
leases := clientset.CoordinationV1().Leases(namespace)
16+
lease, err := leases.Get(ctx, leaseName, metav1.GetOptions{})
17+
if err != nil {
18+
// If not found, create it
19+
lease = &coordinationv1.Lease{
20+
ObjectMeta: metav1.ObjectMeta{
21+
Name: leaseName,
22+
Namespace: namespace,
23+
},
24+
Spec: coordinationv1.LeaseSpec{
25+
HolderIdentity: &holderIdentity,
26+
AcquireTime: &metav1.MicroTime{Time: time.Now()},
27+
RenewTime: &metav1.MicroTime{Time: time.Now()},
28+
LeaseDurationSeconds: &ttlSeconds,
29+
},
30+
}
31+
_, err := leases.Create(ctx, lease, metav1.CreateOptions{})
32+
if err != nil {
33+
return false, fmt.Errorf("failed to create lease: %w", err)
34+
}
35+
return true, nil
36+
}
37+
// If Lease exists, check if expired or held by us
38+
if lease.Spec.HolderIdentity == nil || *lease.Spec.HolderIdentity == "" || leaseExpired(lease) {
39+
lease.Spec.HolderIdentity = &holderIdentity
40+
now := metav1.MicroTime{Time: time.Now()}
41+
lease.Spec.AcquireTime = &now
42+
lease.Spec.RenewTime = &now
43+
lease.Spec.LeaseDurationSeconds = &ttlSeconds
44+
_, err := leases.Update(ctx, lease, metav1.UpdateOptions{})
45+
if err != nil {
46+
return false, fmt.Errorf("failed to update lease: %w", err)
47+
}
48+
return true, nil
49+
}
50+
if lease.Spec.HolderIdentity != nil && *lease.Spec.HolderIdentity == holderIdentity {
51+
// Already held by us, renew
52+
now := metav1.MicroTime{Time: time.Now()}
53+
lease.Spec.RenewTime = &now
54+
_, err := leases.Update(ctx, lease, metav1.UpdateOptions{})
55+
if err != nil {
56+
return false, fmt.Errorf("failed to renew lease: %w", err)
57+
}
58+
return true, nil
59+
}
60+
// Held by someone else and not expired
61+
return false, nil
62+
}
63+
64+
func leaseExpired(lease *coordinationv1.Lease) bool {
65+
if lease.Spec.RenewTime == nil || lease.Spec.LeaseDurationSeconds == nil {
66+
return true
67+
}
68+
expiry := lease.Spec.RenewTime.Time.Add(time.Duration(*lease.Spec.LeaseDurationSeconds) * time.Second)
69+
return time.Now().After(expiry)
70+
}
71+
72+
// ReleaseLease releases the Lease if held by holderIdentity
73+
func ReleaseLease(ctx context.Context, clientset *kubernetes.Clientset, leaseName, namespace, holderIdentity string) error {
74+
leases := clientset.CoordinationV1().Leases(namespace)
75+
lease, err := leases.Get(ctx, leaseName, metav1.GetOptions{})
76+
if err != nil {
77+
return nil // Already gone
78+
}
79+
if lease.Spec.HolderIdentity != nil && *lease.Spec.HolderIdentity == holderIdentity {
80+
// Remove holder
81+
empty := ""
82+
lease.Spec.HolderIdentity = &empty
83+
_, err := leases.Update(ctx, lease, metav1.UpdateOptions{})
84+
return err
85+
}
86+
return nil
87+
}

pkg/apis/hlf.kungfusoftware.es/v1alpha1/hlf_types.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -557,6 +557,8 @@ type FabricPeerStatus struct {
557557
SignCACert string `json:"signCaCert"`
558558
// +optional
559559
NodePort int `json:"port"`
560+
// +optional
561+
CertRenewalLeaseHeld bool `json:"certRenewalLeaseHeld,omitempty"`
560562
}
561563
type OrdererService struct {
562564
// +kubebuilder:validation:Enum=NodePort;ClusterIP;LoadBalancer
@@ -768,6 +770,8 @@ type FabricOrdererNodeStatus struct {
768770
NodePort int `json:"port"`
769771
// +optional
770772
Message string `json:"message"`
773+
// +optional
774+
CertRenewalLeaseHeld bool `json:"certRenewalLeaseHeld,omitempty"`
771775
}
772776

773777
type Cors struct {

0 commit comments

Comments
 (0)