Skip to content

Commit 99d57a1

Browse files
committed
Improve user friendliness of healthcheck errors
1 parent 658a3c0 commit 99d57a1

File tree

1 file changed

+99
-21
lines changed

1 file changed

+99
-21
lines changed

pkg/healthcheck/workloads.go

Lines changed: 99 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,11 @@ package healthcheck
22

33
import (
44
"context"
5+
"errors"
56
"fmt"
67
"path/filepath"
8+
"strings"
9+
"syscall"
710
"time"
811

912
"github.com/openshift/microshift/pkg/config"
@@ -25,13 +28,14 @@ type NamespaceWorkloads struct {
2528
}
2629

2730
func waitForWorkloads(ctx context.Context, timeout time.Duration, workloads map[string]NamespaceWorkloads) error {
28-
restConfig, err := clientcmd.BuildConfigFromFlags("", filepath.Join(config.DataDir, "resources", string(config.KubeAdmin), "kubeconfig"))
31+
kubeconfigPath := filepath.Join(config.DataDir, "resources", string(config.KubeAdmin), "kubeconfig")
32+
restConfig, err := clientcmd.BuildConfigFromFlags("", kubeconfigPath)
2933
if err != nil {
30-
return fmt.Errorf("failed to create restConfig: %v", err)
34+
return fmt.Errorf("failed to load kubeconfig from %s: %v", kubeconfigPath, err)
3135
}
3236
client, err := appsclientv1.NewForConfig(rest.AddUserAgent(restConfig, "healthcheck"))
3337
if err != nil {
34-
return fmt.Errorf("failed to create client: %v", err)
38+
return fmt.Errorf("unable to create Kubernetes client: %v", err)
3539
}
3640

3741
interval := max(timeout/30, 1*time.Second)
@@ -59,33 +63,48 @@ func waitForWorkloads(ctx context.Context, timeout time.Duration, workloads map[
5963

6064
func waitForDaemonSet(ctx context.Context, client *appsclientv1.AppsV1Client, timeout, interval time.Duration, namespace, name string) error {
6165
klog.Infof("Waiting %v for daemonset/%s in %s", timeout, name, namespace)
66+
var lastHumanReadableErr error
6267
err := wait.PollUntilContextTimeout(ctx, interval, timeout, true, func(ctx context.Context) (done bool, err error) {
63-
ds, err := client.DaemonSets(namespace).Get(ctx, name, v1.GetOptions{})
68+
getctx, cancel := context.WithTimeout(ctx, interval/2)
69+
defer cancel()
70+
71+
ds, err := client.DaemonSets(namespace).Get(getctx, name, v1.GetOptions{})
6472
if err != nil {
65-
if apierrors.IsNotFound(err) {
66-
// Resources created by an operator might not exist yet.
67-
// We allow for full timeout duration to be created and become ready.
73+
// Always return 'false, nil' to keep retrying until timeout.
74+
75+
if commonErr := commonGetErrors(err); commonErr != nil {
76+
lastHumanReadableErr = commonErr
77+
return false, nil
78+
}
79+
if isDeadlineExceededError(err) {
6880
return false, nil
6981
}
70-
klog.Errorf("Error getting daemonset/%s in %q: %v", name, namespace, err)
71-
// Ignore errors, give chance until timeout
82+
83+
klog.Errorf("Unexpected error while getting daemonset %q in %q (ignoring): %v", name, namespace, err)
7284
return false, nil
7385
}
7486
klog.V(3).Infof("Status of daemonset/%s in %s: %+v", name, namespace, ds.Status)
7587

7688
// Borrowed and adjusted from k8s.io/kubectl/pkg/polymorphichelpers/rollout_status.go
7789
if ds.Generation > ds.Status.ObservedGeneration {
90+
lastHumanReadableErr = fmt.Errorf("daemonset is still being processed by the controller (generation %d > observed %d)", ds.Generation, ds.Status.ObservedGeneration)
7891
return false, nil
7992
}
8093
if ds.Status.UpdatedNumberScheduled < ds.Status.DesiredNumberScheduled {
94+
lastHumanReadableErr = fmt.Errorf("only %d of %d nodes have the updated daemonset pods", ds.Status.UpdatedNumberScheduled, ds.Status.DesiredNumberScheduled)
8195
return false, nil
8296
}
8397
if ds.Status.NumberAvailable < ds.Status.DesiredNumberScheduled {
98+
lastHumanReadableErr = fmt.Errorf("only %d of %d daemonset pods are ready across all nodes", ds.Status.NumberAvailable, ds.Status.DesiredNumberScheduled)
8499
return false, nil
85100
}
86101
return true, nil
87102
})
88103
if err != nil {
104+
if errors.Is(err, context.DeadlineExceeded) {
105+
klog.Errorf("DaemonSet %q in %q namespace didn't become ready in %v: %v", name, namespace, timeout, lastHumanReadableErr)
106+
return fmt.Errorf("daemonset '%s' in namespace '%s' failed to become ready within %v. Last status: %v", name, namespace, timeout, lastHumanReadableErr)
107+
}
89108
klog.Errorf("Failed waiting for daemonset/%s in %s: %v", name, namespace, err)
90109
return err
91110
}
@@ -95,22 +114,31 @@ func waitForDaemonSet(ctx context.Context, client *appsclientv1.AppsV1Client, ti
95114

96115
func waitForDeployment(ctx context.Context, client *appsclientv1.AppsV1Client, timeout, interval time.Duration, namespace, name string) error {
97116
klog.Infof("Waiting %v for deployment/%s in %s", timeout, name, namespace)
98-
deployment, err := client.Deployments(namespace).Get(ctx, name, v1.GetOptions{})
117+
var lastHumanReadableErr error
99118
err := wait.PollUntilContextTimeout(ctx, interval, timeout, true, func(ctx context.Context) (done bool, err error) {
119+
getctx, cancel := context.WithTimeout(ctx, interval/2)
120+
defer cancel()
121+
122+
deployment, err := client.Deployments(namespace).Get(getctx, name, v1.GetOptions{})
100123
if err != nil {
101-
if apierrors.IsNotFound(err) {
102-
// Resources created by an operator might not exist yet.
103-
// We allow for full timeout duration to be created and become ready.
124+
// Always return 'false, nil' to keep retrying until timeout.
125+
126+
if commonErr := commonGetErrors(err); commonErr != nil {
127+
lastHumanReadableErr = commonErr
104128
return false, nil
105129
}
106-
klog.Errorf("Error getting deployment/%s in %q: %v", name, namespace, err)
107-
// Ignore errors, give chance until timeout
130+
if isDeadlineExceededError(err) {
131+
return false, nil
132+
}
133+
134+
klog.Errorf("Unexpected error while getting deployment %q in %q (ignoring): %v", name, namespace, err)
108135
return false, nil
109136
}
110137
klog.V(3).Infof("Status of deployment/%s in %s: %+v", name, namespace, deployment.Status)
111138

112139
// Borrowed and adjusted from k8s.io/kubectl/pkg/polymorphichelpers/rollout_status.go
113140
if deployment.Generation > deployment.Status.ObservedGeneration {
141+
lastHumanReadableErr = fmt.Errorf("deployment is still being processed by the controller (generation %d > observed %d)", deployment.Generation, deployment.Status.ObservedGeneration)
114142
return false, nil
115143
}
116144
// 'rollout status' command would check the 'Progressing' condition and if the reason is 'ProgressDeadlineExceeded',
@@ -120,17 +148,24 @@ func waitForDeployment(ctx context.Context, client *appsclientv1.AppsV1Client, t
120148
// - we want to give full timeout duration for the Deployment to become ready, no early exits.
121149

122150
if deployment.Spec.Replicas != nil && deployment.Status.UpdatedReplicas < *deployment.Spec.Replicas {
151+
lastHumanReadableErr = fmt.Errorf("only %d of %d pods have been updated with the latest configuration", deployment.Status.UpdatedReplicas, *deployment.Spec.Replicas)
123152
return false, nil
124153
}
125154
if deployment.Status.Replicas > deployment.Status.UpdatedReplicas {
155+
lastHumanReadableErr = fmt.Errorf("%d pods are still running the old configuration while %d are updated", deployment.Status.Replicas-deployment.Status.UpdatedReplicas, deployment.Status.UpdatedReplicas)
126156
return false, nil
127157
}
128158
if deployment.Status.AvailableReplicas < deployment.Status.UpdatedReplicas {
159+
lastHumanReadableErr = fmt.Errorf("only %d of %d updated pods are ready", deployment.Status.AvailableReplicas, deployment.Status.UpdatedReplicas)
129160
return false, nil
130161
}
131162
return true, nil
132163
})
133164
if err != nil {
165+
if errors.Is(err, context.DeadlineExceeded) {
166+
klog.Errorf("Deployment/%s in %s didn't become ready in %v: %v", name, namespace, timeout, lastHumanReadableErr)
167+
return fmt.Errorf("deployment '%s' in namespace '%s' failed to become ready within %v. Last status: %v", name, namespace, timeout, lastHumanReadableErr)
168+
}
134169
klog.Errorf("Failed waiting for deployment/%s in %s: %v", name, namespace, err)
135170
return err
136171
}
@@ -140,44 +175,87 @@ func waitForDeployment(ctx context.Context, client *appsclientv1.AppsV1Client, t
140175

141176
func waitForStatefulSet(ctx context.Context, client *appsclientv1.AppsV1Client, timeout, interval time.Duration, namespace, name string) error {
142177
klog.Infof("Waiting %v for statefulset/%s in %s", timeout, name, namespace)
178+
var lastHumanReadableErr error
143179
err := wait.PollUntilContextTimeout(ctx, interval, timeout, true, func(ctx context.Context) (done bool, err error) {
144-
sts, err := client.StatefulSets(namespace).Get(ctx, name, v1.GetOptions{})
180+
getctx, cancel := context.WithTimeout(ctx, interval/2)
181+
defer cancel()
182+
183+
sts, err := client.StatefulSets(namespace).Get(getctx, name, v1.GetOptions{})
145184
if err != nil {
146-
if apierrors.IsNotFound(err) {
147-
// Resources created by an operator might not exist yet.
148-
// We allow for full timeout duration to be created and become ready.
185+
// Always return 'false, nil' to keep retrying until timeout.
186+
187+
if commonErr := commonGetErrors(err); commonErr != nil {
188+
lastHumanReadableErr = commonErr
189+
return false, nil
190+
}
191+
if isDeadlineExceededError(err) {
149192
return false, nil
150193
}
151-
klog.Errorf("Error getting statefulset/%s in %s: %v", name, namespace, err)
152-
// Ignore errors, give chance until timeout
194+
195+
klog.Errorf("Unexpected error while getting statefulset %q in %q (ignoring): %v", name, namespace, err)
153196
return false, nil
154197
}
155198
klog.V(3).Infof("Status of statefulset/%s in %s: %+v", name, namespace, sts.Status)
156199

157200
// Borrowed and adjusted from k8s.io/kubectl/pkg/polymorphichelpers/rollout_status.go
158201
if sts.Status.ObservedGeneration == 0 || sts.Generation > sts.Status.ObservedGeneration {
202+
lastHumanReadableErr = fmt.Errorf("statefulset is still being processed by the controller (generation %d > observed %d)", sts.Generation, sts.Status.ObservedGeneration)
159203
return false, nil
160204
}
161205
if sts.Spec.Replicas != nil && sts.Status.ReadyReplicas < *sts.Spec.Replicas {
206+
lastHumanReadableErr = fmt.Errorf("only %d of %d replicas are ready", sts.Status.ReadyReplicas, *sts.Spec.Replicas)
162207
return false, nil
163208
}
164209
if sts.Spec.UpdateStrategy.Type == appsv1.RollingUpdateStatefulSetStrategyType && sts.Spec.UpdateStrategy.RollingUpdate != nil {
165210
if sts.Spec.Replicas != nil && sts.Spec.UpdateStrategy.RollingUpdate.Partition != nil {
166211
if sts.Status.UpdatedReplicas < (*sts.Spec.Replicas - *sts.Spec.UpdateStrategy.RollingUpdate.Partition) {
212+
lastHumanReadableErr = fmt.Errorf("only %d of %d replicas have been updated (partition: %d)", sts.Status.UpdatedReplicas, *sts.Spec.Replicas, *sts.Spec.UpdateStrategy.RollingUpdate.Partition)
167213
return false, nil
168214
}
169215
}
170216
return true, nil
171217
}
172218
if sts.Status.UpdateRevision != sts.Status.CurrentRevision {
219+
lastHumanReadableErr = fmt.Errorf("update revision (%s) differs from current revision (%s)", sts.Status.UpdateRevision, sts.Status.CurrentRevision)
173220
return false, nil
174221
}
175222
return true, nil
176223
})
177224
if err != nil {
225+
if errors.Is(err, context.DeadlineExceeded) {
226+
klog.Errorf("Statefulset/%s in %s didn't become ready in %v: %v", name, namespace, timeout, lastHumanReadableErr)
227+
return fmt.Errorf("statefulset '%s' in namespace '%s' failed to become ready within %v. Last status: %v", name, namespace, timeout, lastHumanReadableErr)
228+
}
178229
klog.Errorf("Failed waiting for statefulset/%s in %s: %v", name, namespace, err)
179230
return err
180231
}
181232
klog.Infof("StatefulSet/%s in %s is ready", name, namespace)
182233
return nil
183234
}
235+
236+
// isDeadlineExceededError reports whether err indicates that an operation was
// abandoned because a context deadline expired or was about to be exceeded.
//
// It returns true when:
//   - err's chain contains context.DeadlineExceeded, or
//   - err's message contains "would exceed context deadline" (matched by text;
//     presumably produced by the client rate limiter — verify against the
//     client library in use).
//
// A nil err yields false.
func isDeadlineExceededError(err error) bool {
	if err == nil {
		// Calling err.Error() on a nil error would panic.
		return false
	}

	// 'client rate limiter Wait returned an error: context deadline exceeded' -> drop the wrapping errors
	if errors.Is(err, context.DeadlineExceeded) {
		return true
	}

	return strings.Contains(err.Error(), "would exceed context deadline")
}
248+
249+
func commonGetErrors(err error) error {
250+
if apierrors.IsNotFound(err) {
251+
// Resources created by an operator might not exist yet.
252+
// We allow for full timeout duration to be created and become ready.
253+
return fmt.Errorf("resource does not exist yet")
254+
}
255+
256+
if errors.Is(err, syscall.ECONNREFUSED) {
257+
return fmt.Errorf("cannot connect to API server")
258+
}
259+
260+
return nil
261+
}

0 commit comments

Comments
 (0)