Skip to content

Commit 3049396

Browse files
authored
fix(multicluster): Do not ignore timeout and gateway metrics errors during multicluster check (#14418)
Signed-off-by: Zahari Dichev <[email protected]>
1 parent 554776d commit 3049396

File tree

2 files changed

+23
-6
lines changed

2 files changed

+23
-6
lines changed

multicluster/cmd/check.go

Lines changed: 14 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -730,13 +730,25 @@ func (hc *healthChecker) checkIfGatewayMirrorsHaveEndpoints(ctx context.Context,
730730

731731
// Get and parse the gateway metrics so that we can extract liveness
732732
// information.
733-
gatewayMetrics := getGatewayMetrics(hc.KubeAPIClient(), pods.Items, leaders, wait)
733+
gatewayMetrics, err := getGatewayMetrics(hc.KubeAPIClient(), pods.Items, leaders, wait)
734+
if err != nil {
735+
errors = append(errors, fmt.Errorf("failed to get gateway metrics for target cluster %s: %w", link.Spec.TargetClusterName, err))
736+
continue
737+
}
738+
734739
if len(gatewayMetrics) != 1 {
735740
errors = append(errors, fmt.Errorf("expected exactly one gateway metric for target cluster %s; got %d", link.Spec.TargetClusterName, len(gatewayMetrics)))
736741
continue
737742
}
743+
744+
gatewayMetric := gatewayMetrics[0]
745+
if gatewayMetric.err != nil {
746+
errors = append(errors, fmt.Errorf("Failed to get gateway status for %s: %w\n", gatewayMetric.clusterName, gatewayMetric.err))
747+
continue
748+
}
749+
738750
var metricsParser expfmt.TextParser
739-
parsedMetrics, err := metricsParser.TextToMetricFamilies(bytes.NewReader(gatewayMetrics[0].metrics))
751+
parsedMetrics, err := metricsParser.TextToMetricFamilies(bytes.NewReader(gatewayMetric.metrics))
740752
if err != nil {
741753
errors = append(errors, fmt.Errorf("failed to parse gateway metrics for target cluster %s: %w", link.Spec.TargetClusterName, err))
742754
continue

multicluster/cmd/gateways.go

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,12 @@ func newGatewaysCommand() *cobra.Command {
9898
}
9999

100100
var statuses []gatewayStatus
101-
gatewayMetrics := getGatewayMetrics(k8sAPI, pods.Items, leaders, opts.wait)
101+
gatewayMetrics, err := getGatewayMetrics(k8sAPI, pods.Items, leaders, opts.wait)
102+
if err != nil {
103+
fmt.Fprintf(os.Stderr, "Failed to get gateway metrics for cluster %s: %s\n", opts.clusterName, err)
104+
os.Exit(1)
105+
}
106+
102107
for _, gateway := range gatewayMetrics {
103108
if gateway.err != nil {
104109
fmt.Fprintf(os.Stderr, "Failed to get gateway status for %s: %s\n", gateway.clusterName, gateway.err)
@@ -195,7 +200,7 @@ func newGatewaysCommand() *cobra.Command {
195200
return cmd
196201
}
197202

198-
func getGatewayMetrics(k8sAPI *k8s.KubernetesAPI, pods []corev1.Pod, leaders map[string]struct{}, wait time.Duration) []gatewayMetrics {
203+
func getGatewayMetrics(k8sAPI *k8s.KubernetesAPI, pods []corev1.Pod, leaders map[string]struct{}, wait time.Duration) ([]gatewayMetrics, error) {
199204
var metrics []gatewayMetrics
200205
metricsChan := make(chan gatewayMetrics)
201206
var wg sync.WaitGroup
@@ -250,11 +255,11 @@ wait:
250255
}
251256
metrics = append(metrics, metric)
252257
case <-timeout.C:
253-
break wait
258+
return nil, fmt.Errorf("timed out waiting for metrics")
254259
}
255260
}
256261

257-
return metrics
262+
return metrics, nil
258263
}
259264

260265
func getServiceMirrorContainer(pod corev1.Pod) (corev1.Container, error) {

0 commit comments

Comments
 (0)