Skip to content

Commit 947c437

Browse files
Merge pull request #724 from matysek/ols-1333
OLS-1333: Fix prometheus metrics test flakyness
2 parents 222ac2b + e425687 commit 947c437

File tree

3 files changed

+29
-13
lines changed

3 files changed

+29
-13
lines changed

test/e2e/client.go

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ import (
3030
logf "sigs.k8s.io/controller-runtime/pkg/log"
3131

3232
olsv1alpha1 "github.com/openshift/lightspeed-operator/api/v1alpha1"
33+
k8serrors "k8s.io/apimachinery/pkg/api/errors"
3334
)
3435

3536
const (
@@ -427,7 +428,13 @@ func (c *Client) CreateClusterRoleBinding(namespace, serviceAccount, clusterRole
427428

428429
err := c.Create(clusterRoleBinding)
429430
if err != nil {
430-
return nil, err
431+
// ROSA clusters have more restricted permissions.
432+
// Deleting ClusterRoleBindings is not allowed.
433+
if k8serrors.IsAlreadyExists(err) {
434+
logf.Log.Error(err, "ClusterRoleBindings for test already exists (happens for ROSA clusters)")
435+
} else {
436+
return nil, err
437+
}
431438
}
432439

433440
return func() {

test/e2e/metrics_test.go

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@ import (
1313
"sigs.k8s.io/controller-runtime/pkg/client/config"
1414
)
1515

16-
var _ = Describe("Prometheus Metrics", Ordered, func() {
16+
// Try to retry test case multipletimes when failing.
17+
var _ = Describe("Prometheus Metrics", FlakeAttempts(5), Ordered, func() {
1718
const metricsViewerSAName = "metrics-viewer-sa"
1819
const clusterMonitoringViewClusterRole = "cluster-monitoring-view"
1920
var cr *olsv1alpha1.OLSConfig
@@ -36,32 +37,37 @@ var _ = Describe("Prometheus Metrics", Ordered, func() {
3637

3738
By("create a service account")
3839
cleanUp, err := client.CreateServiceAccount(OLSNameSpace, metricsViewerSAName)
39-
Expect(err).NotTo(HaveOccurred())
4040
cleanUpFuncs = append(cleanUpFuncs, cleanUp)
41+
Expect(err).NotTo(HaveOccurred())
4142

4243
By("create a role binding for application metrics access")
4344
cleanUp, err = client.CreateClusterRoleBinding(OLSNameSpace, metricsViewerSAName, clusterMonitoringViewClusterRole)
44-
Expect(err).NotTo(HaveOccurred())
4545
cleanUpFuncs = append(cleanUpFuncs, cleanUp)
46+
Expect(err).NotTo(HaveOccurred())
4647

4748
By("fetch the service account token")
4849
saToken, err = client.GetServiceAccountToken(OLSNameSpace, metricsViewerSAName)
4950
Expect(err).NotTo(HaveOccurred())
5051

51-
// Get a Kubernetes rest config
52+
By("fetch a Kubernetes rest config")
5253
cfg, err := config.GetConfig()
5354
Expect(err).NotTo(HaveOccurred())
5455

5556
openshiftRouteClient, err := routev1.NewForConfig(cfg)
5657
Expect(err).NotTo(HaveOccurred())
5758

58-
prometheusClient, err = NewPrometheusClientFromRoute(
59-
context.Background(),
60-
openshiftRouteClient,
61-
"openshift-monitoring", "thanos-querier",
62-
saToken,
63-
)
64-
Expect(err).NotTo(HaveOccurred())
59+
By("fetch a Prometheus route")
60+
// Retry multiple times to fetch route
61+
// mitigates networking issues
62+
Eventually(func() error {
63+
prometheusClient, err = NewPrometheusClientFromRoute(
64+
context.Background(),
65+
openshiftRouteClient,
66+
"openshift-monitoring", "thanos-querier",
67+
saToken,
68+
)
69+
return err
70+
}, 10*time.Second, 2*time.Second).ShouldNot(HaveOccurred())
6571

6672
By("wait for application server deployment rollout")
6773
deployment := &appsv1.Deployment{

test/e2e/prometheus_client.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import (
1616

1717
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1818
"k8s.io/apimachinery/pkg/util/wait"
19+
logf "sigs.k8s.io/controller-runtime/pkg/log"
1920
)
2021

2122
// PrometheusClient provides access to the Prometheus, Thanos & Alertmanager API.
@@ -30,7 +31,7 @@ type PrometheusClient struct {
3031

3132
// DefaultPollInterval is the default interval for polling Prometheus metrics.
3233
// Prometheus metrics are typically scraped every 30 seconds. This is enough for 3 scrapes.
33-
const DefaultPrometheusQueryTimeout = 95 * time.Second
34+
const DefaultPrometheusQueryTimeout = 120 * time.Second
3435

3536
// NewPrometheusClientFromRoute creates and returns a new PrometheusClient from the given OpenShift route.
3637
func NewPrometheusClientFromRoute(
@@ -41,6 +42,8 @@ func NewPrometheusClientFromRoute(
4142
) (*PrometheusClient, error) {
4243
route, err := routeClient.Routes(namespace).Get(ctx, name, metav1.GetOptions{})
4344
if err != nil {
45+
46+
logf.Log.Error(err, "Error fetching Prometheus route")
4447
return nil, err
4548
}
4649

0 commit comments

Comments
 (0)