diff --git a/pkg/controller/metrics/metrics.go b/pkg/controller/metrics/metrics.go new file mode 100644 index 0000000000..c04c8aaad0 --- /dev/null +++ b/pkg/controller/metrics/metrics.go @@ -0,0 +1,195 @@ +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package metrics + +import ( + "time" + + "github.com/prometheus/client_golang/prometheus" + internalmetrics "sigs.k8s.io/controller-runtime/pkg/internal/metrics" + "sigs.k8s.io/controller-runtime/pkg/metrics" +) + +var ( + // reconcileTotal is a prometheus counter metric which holds the total + // number of reconciliations per controller. It has two labels. controller label refers + // to the controller name and result label refers to the reconcile result i.e. + // success, error, requeue, requeue_after. + reconcileTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "controller_runtime_reconcile_total", + Help: "Total number of reconciliations per controller", + }, []string{"controller", "result"}) + + // reconcileErrors is a prometheus counter metric which holds the total + // number of errors from the Reconciler. + reconcileErrors = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "controller_runtime_reconcile_errors_total", + Help: "Total number of reconciliation errors per controller", + }, []string{"controller"}) + + // terminalReconcileErrors is a prometheus counter metric which holds the total + // number of terminal errors from the Reconciler. 
+ terminalReconcileErrors = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "controller_runtime_terminal_reconcile_errors_total", + Help: "Total number of terminal reconciliation errors per controller", + }, []string{"controller"}) + + // reconcilePanics is a prometheus counter metric which holds the total + // number of panics from the Reconciler. + reconcilePanics = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "controller_runtime_reconcile_panics_total", + Help: "Total number of reconciliation panics per controller", + }, []string{"controller"}) + + // reconcileTime is a prometheus metric which keeps track of the duration + // of reconciliations, in seconds. + reconcileTime = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Name: "controller_runtime_reconcile_time_seconds", + Help: "Length of time per reconciliation per controller", + Buckets: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, + 1.25, 1.5, 1.75, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 40, 50, 60}, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: 1 * time.Hour, + }, []string{"controller"}) + + // workerCount is a prometheus metric which holds the maximum number of + // concurrent reconciles per controller. + workerCount = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "controller_runtime_max_concurrent_reconciles", + Help: "Maximum number of concurrent reconciles per controller", + }, []string{"controller"}) + + // activeWorkers is a prometheus metric which holds the number + // of active workers per controller. 
+ activeWorkers = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "controller_runtime_active_workers", + Help: "Number of currently used workers per controller", + }, []string{"controller"}) +) + +// ControllerMetricsProvider is the interface through which controllers obtain the metrics they record; a custom implementation can be installed with SetControllerMetricsProvider +type ControllerMetricsProvider interface { + // ReconcileTotal returns the counter metric which holds the total + // number of reconciliations per controller. It has two labels. controller label refers + // to the controller name and result label refers to the reconcile result i.e. + // success, error, requeue, requeue_after. + ReconcileTotal() internalmetrics.CounterMetric + // ReconcileErrors returns the counter metric which holds the total + // number of errors from the Reconciler. + ReconcileErrors() internalmetrics.CounterMetric + // TerminalReconcileErrors returns the counter metric which holds the total + // number of terminal errors from the Reconciler. + TerminalReconcileErrors() internalmetrics.CounterMetric + // ReconcilePanics returns the counter metric which holds the total + // number of panics from the Reconciler. + ReconcilePanics() internalmetrics.CounterMetric + // ReconcileTime returns the histogram metric which keeps track of the duration + // of reconciliations. + ReconcileTime() internalmetrics.HistogramMetric + // WorkerCount returns the gauge metric which holds the maximum number of + // concurrent reconciles per controller. + WorkerCount() internalmetrics.GaugeMetric + // ActiveWorkers returns the gauge metric which holds the number + // of active workers per controller. 
+ ActiveWorkers() internalmetrics.GaugeMetric +} + +// PrometheusProvider is the default ControllerMetricsProvider; its methods wrap the package-level +// prometheus collectors, which are registered with metrics.Registry by this package's init function +type PrometheusProvider struct { + reconcileTotal *prometheus.CounterVec + reconcileErrors *prometheus.CounterVec + terminalReconcileErrors *prometheus.CounterVec + reconcilePanics *prometheus.CounterVec + reconcileTime *prometheus.HistogramVec + workerCount *prometheus.GaugeVec + activeWorkers *prometheus.GaugeVec +} + +// NewPrometheusProvider creates a PrometheusProvider backed by the package-level collectors, so every instance shares the same underlying vectors +func NewPrometheusProvider() *PrometheusProvider { + return &PrometheusProvider{ + reconcileTotal: reconcileTotal, + reconcileErrors: reconcileErrors, + terminalReconcileErrors: terminalReconcileErrors, + reconcilePanics: reconcilePanics, + reconcileTime: reconcileTime, + workerCount: workerCount, + activeWorkers: activeWorkers, + } +} + +// ReconcileTotal returns a Prometheus counter that fulfills the CounterMetric interface +func (p PrometheusProvider) ReconcileTotal() internalmetrics.CounterMetric { + return &internalmetrics.PrometheusCounterAdapter{CounterVec: p.reconcileTotal} +} + +// ReconcileErrors returns a Prometheus counter that fulfills the CounterMetric interface +func (p PrometheusProvider) ReconcileErrors() internalmetrics.CounterMetric { + return &internalmetrics.PrometheusCounterAdapter{CounterVec: p.reconcileErrors} +} + +// TerminalReconcileErrors returns a Prometheus counter that fulfills the CounterMetric interface +func (p PrometheusProvider) TerminalReconcileErrors() internalmetrics.CounterMetric { + return &internalmetrics.PrometheusCounterAdapter{CounterVec: p.terminalReconcileErrors} +} + +// ReconcilePanics returns a Prometheus counter that fulfills the CounterMetric interface +func (p PrometheusProvider) ReconcilePanics() internalmetrics.CounterMetric { + return 
&internalmetrics.PrometheusCounterAdapter{CounterVec: p.reconcilePanics} +} + +// ReconcileTime returns a Prometheus histogram that fulfills the HistogramMetric interface +func (p PrometheusProvider) ReconcileTime() internalmetrics.HistogramMetric { + return &internalmetrics.PrometheusHistogramAdapter{HistogramVec: p.reconcileTime} +} + +// WorkerCount returns a Prometheus gauge that fulfills the GaugeMetric interface +func (p PrometheusProvider) WorkerCount() internalmetrics.GaugeMetric { + return &internalmetrics.PrometheusGaugeAdapter{GaugeVec: p.workerCount} +} + +// ActiveWorkers returns a Prometheus gauge that fulfills the GaugeMetric interface +func (p PrometheusProvider) ActiveWorkers() internalmetrics.GaugeMetric { + return &internalmetrics.PrometheusGaugeAdapter{GaugeVec: p.activeWorkers} +} + +func init() { + metrics.Registry.MustRegister( + reconcileTotal, + reconcileErrors, + terminalReconcileErrors, + reconcilePanics, + reconcileTime, + workerCount, + activeWorkers, + ) +} + +var controllerMetricsProvider ControllerMetricsProvider = NewPrometheusProvider() + +// SetControllerMetricsProvider assigns a provider to the ControllerMetricsProvider for exposing controller metrics. 
+// The PrometheusProvider is used by default. The assignment is unsynchronized, so set the provider before controllers start reconciling +func SetControllerMetricsProvider(provider ControllerMetricsProvider) { + controllerMetricsProvider = provider +} + +// GetControllerMetricsProvider returns the controller metrics provider being used by the controller reconciliation +func GetControllerMetricsProvider() ControllerMetricsProvider { + return controllerMetricsProvider +} diff --git a/pkg/controller/priorityqueue/metrics.go b/pkg/controller/priorityqueue/metrics.go index 967a252dfb..d84a9b19c6 100644 --- a/pkg/controller/priorityqueue/metrics.go +++ b/pkg/controller/priorityqueue/metrics.go @@ -6,7 +6,7 @@ import ( "k8s.io/client-go/util/workqueue" "k8s.io/utils/clock" - "sigs.k8s.io/controller-runtime/pkg/internal/metrics" + "sigs.k8s.io/controller-runtime/pkg/metrics" ) // This file is mostly a copy of unexported code from diff --git a/pkg/controller/priorityqueue/metrics_test.go b/pkg/controller/priorityqueue/metrics_test.go index 3be3989d89..7292f5d81d 100644 --- a/pkg/controller/priorityqueue/metrics_test.go +++ b/pkg/controller/priorityqueue/metrics_test.go @@ -4,7 +4,7 @@ import ( "sync" "k8s.io/client-go/util/workqueue" - "sigs.k8s.io/controller-runtime/pkg/internal/metrics" + "sigs.k8s.io/controller-runtime/pkg/metrics" ) func newFakeMetricsProvider() *fakeMetricsProvider { diff --git a/pkg/controller/priorityqueue/priorityqueue.go b/pkg/controller/priorityqueue/priorityqueue.go index c3f77a6f39..29e6321d1a 100644 --- a/pkg/controller/priorityqueue/priorityqueue.go +++ b/pkg/controller/priorityqueue/priorityqueue.go @@ -12,7 +12,7 @@ import ( "k8s.io/utils/clock" "k8s.io/utils/ptr" - "sigs.k8s.io/controller-runtime/pkg/internal/metrics" + "sigs.k8s.io/controller-runtime/pkg/metrics" ) // AddOpts describes the options for adding items to the queue. 
@@ -56,7 +56,7 @@ func New[T comparable](name string, o ...Opt[T]) PriorityQueue[T] { } if opts.MetricProvider == nil { - opts.MetricProvider = metrics.WorkqueueMetricsProvider{} + opts.MetricProvider = metrics.PrometheusWorkqueueMetricsProvider{} } pq := &priorityqueue[T]{ diff --git a/pkg/internal/controller/controller.go b/pkg/internal/controller/controller.go index 9fa7ec71e1..85d8b07072 100644 --- a/pkg/internal/controller/controller.go +++ b/pkg/internal/controller/controller.go @@ -30,9 +30,9 @@ import ( utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/apimachinery/pkg/util/uuid" "k8s.io/client-go/util/workqueue" + "sigs.k8s.io/controller-runtime/pkg/controller/metrics" "sigs.k8s.io/controller-runtime/pkg/controller/priorityqueue" - ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/internal/controller/metrics" logf "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/reconcile" "sigs.k8s.io/controller-runtime/pkg/source" @@ -101,7 +101,7 @@ type Controller[request comparable] struct { func (c *Controller[request]) Reconcile(ctx context.Context, req request) (_ reconcile.Result, err error) { defer func() { if r := recover(); r != nil { - ctrlmetrics.ReconcilePanics.WithLabelValues(c.Name).Inc() + metrics.GetControllerMetricsProvider().ReconcilePanics().Inc(map[string]string{labelKeyController: c.Name}) if c.RecoverPanic == nil || *c.RecoverPanic { for _, fn := range utilruntime.PanicHandlers { @@ -294,30 +294,32 @@ func (c *Controller[request]) processNextWorkItem(ctx context.Context) bool { // period. 
defer c.Queue.Done(obj) - ctrlmetrics.ActiveWorkers.WithLabelValues(c.Name).Add(1) - defer ctrlmetrics.ActiveWorkers.WithLabelValues(c.Name).Add(-1) + metrics.GetControllerMetricsProvider().ActiveWorkers().Add(map[string]string{labelKeyController: c.Name}, 1) + defer metrics.GetControllerMetricsProvider().ActiveWorkers().Add(map[string]string{labelKeyController: c.Name}, -1) c.reconcileHandler(ctx, obj, priority) return true } const ( - labelError = "error" - labelRequeueAfter = "requeue_after" - labelRequeue = "requeue" - labelSuccess = "success" + labelKeyController = "controller" + labelKeyResult = "result" + labelError = "error" + labelRequeueAfter = "requeue_after" + labelRequeue = "requeue" + labelSuccess = "success" ) func (c *Controller[request]) initMetrics() { - ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, labelError).Add(0) - ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, labelRequeueAfter).Add(0) - ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, labelRequeue).Add(0) - ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, labelSuccess).Add(0) - ctrlmetrics.ReconcileErrors.WithLabelValues(c.Name).Add(0) - ctrlmetrics.TerminalReconcileErrors.WithLabelValues(c.Name).Add(0) - ctrlmetrics.ReconcilePanics.WithLabelValues(c.Name).Add(0) - ctrlmetrics.WorkerCount.WithLabelValues(c.Name).Set(float64(c.MaxConcurrentReconciles)) - ctrlmetrics.ActiveWorkers.WithLabelValues(c.Name).Set(0) + metrics.GetControllerMetricsProvider().ReconcileTotal().Add(map[string]string{labelKeyController: c.Name, labelKeyResult: labelError}, 0) + metrics.GetControllerMetricsProvider().ReconcileTotal().Add(map[string]string{labelKeyController: c.Name, labelKeyResult: labelRequeueAfter}, 0) + metrics.GetControllerMetricsProvider().ReconcileTotal().Add(map[string]string{labelKeyController: c.Name, labelKeyResult: labelRequeue}, 0) + metrics.GetControllerMetricsProvider().ReconcileTotal().Add(map[string]string{labelKeyController: c.Name, labelKeyResult: labelSuccess}, 0) + 
metrics.GetControllerMetricsProvider().ReconcileErrors().Add(map[string]string{labelKeyController: c.Name}, 0) + metrics.GetControllerMetricsProvider().TerminalReconcileErrors().Add(map[string]string{labelKeyController: c.Name}, 0) + metrics.GetControllerMetricsProvider().ReconcilePanics().Add(map[string]string{labelKeyController: c.Name}, 0) + metrics.GetControllerMetricsProvider().WorkerCount().Set(map[string]string{labelKeyController: c.Name}, float64(c.MaxConcurrentReconciles)) + metrics.GetControllerMetricsProvider().ActiveWorkers().Set(map[string]string{labelKeyController: c.Name}, 0) } func (c *Controller[request]) reconcileHandler(ctx context.Context, req request, priority int) { @@ -341,12 +343,12 @@ func (c *Controller[request]) reconcileHandler(ctx context.Context, req request, switch { case err != nil: if errors.Is(err, reconcile.TerminalError(nil)) { - ctrlmetrics.TerminalReconcileErrors.WithLabelValues(c.Name).Inc() + metrics.GetControllerMetricsProvider().TerminalReconcileErrors().Inc(map[string]string{labelKeyController: c.Name}) } else { c.Queue.AddWithOpts(priorityqueue.AddOpts{RateLimited: true, Priority: priority}, req) } - ctrlmetrics.ReconcileErrors.WithLabelValues(c.Name).Inc() - ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, labelError).Inc() + metrics.GetControllerMetricsProvider().ReconcileErrors().Inc(map[string]string{labelKeyController: c.Name}) + metrics.GetControllerMetricsProvider().ReconcileTotal().Inc(map[string]string{labelKeyController: c.Name, labelKeyResult: labelError}) if !result.IsZero() { log.Info("Warning: Reconciler returned both a non-zero result and a non-nil error. The result will always be ignored if the error is non-nil and the non-nil error causes requeuing with exponential backoff. 
For more details, see: https://pkg.go.dev/sigs.k8s.io/controller-runtime/pkg/reconcile#Reconciler") } @@ -359,17 +361,17 @@ func (c *Controller[request]) reconcileHandler(ctx context.Context, req request, // to result.RequestAfter c.Queue.Forget(req) c.Queue.AddWithOpts(priorityqueue.AddOpts{After: result.RequeueAfter, Priority: priority}, req) - ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, labelRequeueAfter).Inc() + metrics.GetControllerMetricsProvider().ReconcileTotal().Inc(map[string]string{labelKeyController: c.Name, labelKeyResult: labelRequeueAfter}) case result.Requeue: //nolint: staticcheck // We have to handle it until it is removed log.V(5).Info("Reconcile done, requeueing") c.Queue.AddWithOpts(priorityqueue.AddOpts{RateLimited: true, Priority: priority}, req) - ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, labelRequeue).Inc() + metrics.GetControllerMetricsProvider().ReconcileTotal().Inc(map[string]string{labelKeyController: c.Name, labelKeyResult: labelRequeue}) default: log.V(5).Info("Reconcile successful") // Finally, if no error occurs we Forget this item so it does not // get queued again until another change happens. c.Queue.Forget(req) - ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, labelSuccess).Inc() + metrics.GetControllerMetricsProvider().ReconcileTotal().Inc(map[string]string{labelKeyController: c.Name, labelKeyResult: labelSuccess}) } } @@ -380,7 +382,7 @@ func (c *Controller[request]) GetLogger() logr.Logger { // updateMetrics updates prometheus metrics within the controller. func (c *Controller[request]) updateMetrics(reconcileTime time.Duration) { - ctrlmetrics.ReconcileTime.WithLabelValues(c.Name).Observe(reconcileTime.Seconds()) + metrics.GetControllerMetricsProvider().ReconcileTime().Observe(map[string]string{labelKeyController: c.Name}, reconcileTime.Seconds()) } // ReconcileIDFromContext gets the reconcileID from the current context. 
diff --git a/pkg/internal/controller/controller_test.go b/pkg/internal/controller/controller_test.go index 3fde5da9c8..e8362b34e4 100644 --- a/pkg/internal/controller/controller_test.go +++ b/pkg/internal/controller/controller_test.go @@ -38,11 +38,12 @@ import ( "sigs.k8s.io/controller-runtime/pkg/cache/informertest" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllertest" + ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/controller/metrics" "sigs.k8s.io/controller-runtime/pkg/controller/priorityqueue" "sigs.k8s.io/controller-runtime/pkg/event" "sigs.k8s.io/controller-runtime/pkg/handler" - ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/internal/controller/metrics" "sigs.k8s.io/controller-runtime/pkg/internal/log" + intmetrics "sigs.k8s.io/controller-runtime/pkg/internal/metrics" "sigs.k8s.io/controller-runtime/pkg/reconcile" "sigs.k8s.io/controller-runtime/pkg/source" ) @@ -810,13 +811,13 @@ var _ = Describe("controller", func() { var reconcileTotal dto.Metric BeforeEach(func() { - ctrlmetrics.ReconcileTotal.Reset() + ctrlmetrics.GetControllerMetricsProvider().ReconcileTotal().(*intmetrics.PrometheusCounterAdapter).CounterVec.Reset() reconcileTotal.Reset() }) It("should get updated on successful reconciliation", func() { Expect(func() error { - Expect(ctrlmetrics.ReconcileTotal.WithLabelValues(ctrl.Name, "success").Write(&reconcileTotal)).To(Succeed()) + Expect(ctrlmetrics.GetControllerMetricsProvider().ReconcileTotal().(*intmetrics.PrometheusCounterAdapter).WithLabelValues(ctrl.Name, "success").Write(&reconcileTotal)).To(Succeed()) if reconcileTotal.GetCounter().GetValue() != 0.0 { return fmt.Errorf("metric reconcile total not reset") } @@ -835,7 +836,7 @@ var _ = Describe("controller", func() { fakeReconcile.AddResult(reconcile.Result{}, nil) Expect(<-reconciled).To(Equal(request)) Eventually(func() error { - Expect(ctrlmetrics.ReconcileTotal.WithLabelValues(ctrl.Name, 
"success").Write(&reconcileTotal)).To(Succeed()) + Expect(ctrlmetrics.GetControllerMetricsProvider().ReconcileTotal().(*intmetrics.PrometheusCounterAdapter).WithLabelValues(ctrl.Name, "success").Write(&reconcileTotal)).To(Succeed()) if actual := reconcileTotal.GetCounter().GetValue(); actual != 1.0 { return fmt.Errorf("metric reconcile total expected: %v and got: %v", 1.0, actual) } @@ -845,7 +846,7 @@ var _ = Describe("controller", func() { It("should get updated on reconcile errors", func() { Expect(func() error { - Expect(ctrlmetrics.ReconcileTotal.WithLabelValues(ctrl.Name, "error").Write(&reconcileTotal)).To(Succeed()) + Expect(ctrlmetrics.GetControllerMetricsProvider().ReconcileTotal().(*intmetrics.PrometheusCounterAdapter).WithLabelValues(ctrl.Name, "error").Write(&reconcileTotal)).To(Succeed()) if reconcileTotal.GetCounter().GetValue() != 0.0 { return fmt.Errorf("metric reconcile total not reset") } @@ -864,7 +865,7 @@ var _ = Describe("controller", func() { fakeReconcile.AddResult(reconcile.Result{}, fmt.Errorf("expected error: reconcile")) Expect(<-reconciled).To(Equal(request)) Eventually(func() error { - Expect(ctrlmetrics.ReconcileTotal.WithLabelValues(ctrl.Name, "error").Write(&reconcileTotal)).To(Succeed()) + Expect(ctrlmetrics.GetControllerMetricsProvider().ReconcileTotal().(*intmetrics.PrometheusCounterAdapter).WithLabelValues(ctrl.Name, "error").Write(&reconcileTotal)).To(Succeed()) if actual := reconcileTotal.GetCounter().GetValue(); actual != 1.0 { return fmt.Errorf("metric reconcile total expected: %v and got: %v", 1.0, actual) } @@ -874,7 +875,7 @@ var _ = Describe("controller", func() { It("should get updated when reconcile returns with retry enabled", func() { Expect(func() error { - Expect(ctrlmetrics.ReconcileTotal.WithLabelValues(ctrl.Name, "retry").Write(&reconcileTotal)).To(Succeed()) + Expect(ctrlmetrics.GetControllerMetricsProvider().ReconcileTotal().(*intmetrics.PrometheusCounterAdapter).WithLabelValues(ctrl.Name, 
"retry").Write(&reconcileTotal)).To(Succeed()) if reconcileTotal.GetCounter().GetValue() != 0.0 { return fmt.Errorf("metric reconcile total not reset") } @@ -894,7 +895,7 @@ var _ = Describe("controller", func() { fakeReconcile.AddResult(reconcile.Result{Requeue: true}, nil) Expect(<-reconciled).To(Equal(request)) Eventually(func() error { - Expect(ctrlmetrics.ReconcileTotal.WithLabelValues(ctrl.Name, "requeue").Write(&reconcileTotal)).To(Succeed()) + Expect(ctrlmetrics.GetControllerMetricsProvider().ReconcileTotal().(*intmetrics.PrometheusCounterAdapter).WithLabelValues(ctrl.Name, "requeue").Write(&reconcileTotal)).To(Succeed()) if actual := reconcileTotal.GetCounter().GetValue(); actual != 1.0 { return fmt.Errorf("metric reconcile total expected: %v and got: %v", 1.0, actual) } @@ -904,7 +905,7 @@ var _ = Describe("controller", func() { It("should get updated when reconcile returns with retryAfter enabled", func() { Expect(func() error { - Expect(ctrlmetrics.ReconcileTotal.WithLabelValues(ctrl.Name, "retry_after").Write(&reconcileTotal)).To(Succeed()) + Expect(ctrlmetrics.GetControllerMetricsProvider().ReconcileTotal().(*intmetrics.PrometheusCounterAdapter).WithLabelValues(ctrl.Name, "retry_after").Write(&reconcileTotal)).To(Succeed()) if reconcileTotal.GetCounter().GetValue() != 0.0 { return fmt.Errorf("metric reconcile total not reset") } @@ -923,7 +924,7 @@ var _ = Describe("controller", func() { fakeReconcile.AddResult(reconcile.Result{RequeueAfter: 5 * time.Hour}, nil) Expect(<-reconciled).To(Equal(request)) Eventually(func() error { - Expect(ctrlmetrics.ReconcileTotal.WithLabelValues(ctrl.Name, "requeue_after").Write(&reconcileTotal)).To(Succeed()) + Expect(ctrlmetrics.GetControllerMetricsProvider().ReconcileTotal().(*intmetrics.PrometheusCounterAdapter).WithLabelValues(ctrl.Name, "requeue_after").Write(&reconcileTotal)).To(Succeed()) if actual := reconcileTotal.GetCounter().GetValue(); actual != 1.0 { return fmt.Errorf("metric reconcile total expected: %v 
and got: %v", 1.0, actual) } @@ -935,9 +936,9 @@ var _ = Describe("controller", func() { Context("should update prometheus metrics", func() { It("should requeue a Request if there is an error and continue processing items", func() { var reconcileErrs dto.Metric - ctrlmetrics.ReconcileErrors.Reset() + ctrlmetrics.GetControllerMetricsProvider().ReconcileErrors().(*intmetrics.PrometheusCounterAdapter).Reset() Expect(func() error { - Expect(ctrlmetrics.ReconcileErrors.WithLabelValues(ctrl.Name).Write(&reconcileErrs)).To(Succeed()) + Expect(ctrlmetrics.GetControllerMetricsProvider().ReconcileErrors().(*intmetrics.PrometheusCounterAdapter).WithLabelValues(ctrl.Name).Write(&reconcileErrs)).To(Succeed()) if reconcileErrs.GetCounter().GetValue() != 0.0 { return fmt.Errorf("metric reconcile errors not reset") } @@ -956,7 +957,7 @@ var _ = Describe("controller", func() { fakeReconcile.AddResult(reconcile.Result{}, fmt.Errorf("expected error: reconcile")) Expect(<-reconciled).To(Equal(request)) Eventually(func() error { - Expect(ctrlmetrics.ReconcileErrors.WithLabelValues(ctrl.Name).Write(&reconcileErrs)).To(Succeed()) + Expect(ctrlmetrics.GetControllerMetricsProvider().ReconcileErrors().(*intmetrics.PrometheusCounterAdapter).WithLabelValues(ctrl.Name).Write(&reconcileErrs)).To(Succeed()) if reconcileErrs.GetCounter().GetValue() != 1.0 { return fmt.Errorf("metrics not updated") } @@ -974,10 +975,10 @@ var _ = Describe("controller", func() { It("should add a reconcile time to the reconcile time histogram", func() { var reconcileTime dto.Metric - ctrlmetrics.ReconcileTime.Reset() + ctrlmetrics.GetControllerMetricsProvider().ReconcileTime().(*intmetrics.PrometheusHistogramAdapter).Reset() Expect(func() error { - histObserver := ctrlmetrics.ReconcileTime.WithLabelValues(ctrl.Name) + histObserver := ctrlmetrics.GetControllerMetricsProvider().ReconcileTime().(*intmetrics.PrometheusHistogramAdapter).WithLabelValues(ctrl.Name) hist := histObserver.(prometheus.Histogram) 
Expect(hist.Write(&reconcileTime)).To(Succeed()) if reconcileTime.GetHistogram().GetSampleCount() != uint64(0) { @@ -1003,7 +1004,7 @@ var _ = Describe("controller", func() { Eventually(func() int { return queue.NumRequeues(request) }).Should(Equal(0)) Eventually(func() error { - histObserver := ctrlmetrics.ReconcileTime.WithLabelValues(ctrl.Name) + histObserver := ctrlmetrics.GetControllerMetricsProvider().ReconcileTime().(*intmetrics.PrometheusHistogramAdapter).WithLabelValues(ctrl.Name) hist := histObserver.(prometheus.Histogram) Expect(hist.Write(&reconcileTime)).To(Succeed()) if reconcileTime.GetHistogram().GetSampleCount() == uint64(0) { diff --git a/pkg/internal/controller/metrics/metrics.go b/pkg/internal/controller/metrics/metrics.go deleted file mode 100644 index 450e9ae25b..0000000000 --- a/pkg/internal/controller/metrics/metrics.go +++ /dev/null @@ -1,99 +0,0 @@ -/* -Copyright 2018 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package metrics - -import ( - "time" - - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/collectors" - "sigs.k8s.io/controller-runtime/pkg/metrics" -) - -var ( - // ReconcileTotal is a prometheus counter metrics which holds the total - // number of reconciliations per controller. It has two labels. controller label refers - // to the controller name and result label refers to the reconcile result i.e - // success, error, requeue, requeue_after. 
- ReconcileTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ - Name: "controller_runtime_reconcile_total", - Help: "Total number of reconciliations per controller", - }, []string{"controller", "result"}) - - // ReconcileErrors is a prometheus counter metrics which holds the total - // number of errors from the Reconciler. - ReconcileErrors = prometheus.NewCounterVec(prometheus.CounterOpts{ - Name: "controller_runtime_reconcile_errors_total", - Help: "Total number of reconciliation errors per controller", - }, []string{"controller"}) - - // TerminalReconcileErrors is a prometheus counter metrics which holds the total - // number of terminal errors from the Reconciler. - TerminalReconcileErrors = prometheus.NewCounterVec(prometheus.CounterOpts{ - Name: "controller_runtime_terminal_reconcile_errors_total", - Help: "Total number of terminal reconciliation errors per controller", - }, []string{"controller"}) - - // ReconcilePanics is a prometheus counter metrics which holds the total - // number of panics from the Reconciler. - ReconcilePanics = prometheus.NewCounterVec(prometheus.CounterOpts{ - Name: "controller_runtime_reconcile_panics_total", - Help: "Total number of reconciliation panics per controller", - }, []string{"controller"}) - - // ReconcileTime is a prometheus metric which keeps track of the duration - // of reconciliations. 
- ReconcileTime = prometheus.NewHistogramVec(prometheus.HistogramOpts{ - Name: "controller_runtime_reconcile_time_seconds", - Help: "Length of time per reconciliation per controller", - Buckets: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, - 1.25, 1.5, 1.75, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 40, 50, 60}, - NativeHistogramBucketFactor: 1.1, - NativeHistogramMaxBucketNumber: 100, - NativeHistogramMinResetDuration: 1 * time.Hour, - }, []string{"controller"}) - - // WorkerCount is a prometheus metric which holds the number of - // concurrent reconciles per controller. - WorkerCount = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "controller_runtime_max_concurrent_reconciles", - Help: "Maximum number of concurrent reconciles per controller", - }, []string{"controller"}) - - // ActiveWorkers is a prometheus metric which holds the number - // of active workers per controller. - ActiveWorkers = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "controller_runtime_active_workers", - Help: "Number of currently used workers per controller", - }, []string{"controller"}) -) - -func init() { - metrics.Registry.MustRegister( - ReconcileTotal, - ReconcileErrors, - TerminalReconcileErrors, - ReconcilePanics, - ReconcileTime, - WorkerCount, - ActiveWorkers, - // expose process metrics like CPU, Memory, file descriptor usage etc. - collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}), - // expose all Go runtime metrics like GC stats, memory stats etc. - collectors.NewGoCollector(collectors.WithGoCollectorRuntimeMetrics(collectors.MetricsAll)), - ) -} diff --git a/pkg/internal/metrics/metrics.go b/pkg/internal/metrics/metrics.go new file mode 100644 index 0000000000..826c7d75f0 --- /dev/null +++ b/pkg/internal/metrics/metrics.go @@ -0,0 +1,68 @@ +/* +Copyright 2018 The Kubernetes Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package metrics + +import "github.com/prometheus/client_golang/prometheus" + +// HistogramMetric is a metric that records observed values; the map argument carries the label name/value pairs that select the series +type HistogramMetric interface { + Observe(map[string]string, float64) +} + +// GaugeMetric is a metric that gets set and can be changed dynamically at runtime; the map argument carries the label name/value pairs that select the series +type GaugeMetric interface { + Set(map[string]string, float64) + Add(map[string]string, float64) +} + +// CounterMetric is a metric that gets incremented monotonically; the map argument carries the label name/value pairs that select the series +type CounterMetric interface { + Inc(map[string]string) + Add(map[string]string, float64) +} + +type PrometheusCounterAdapter struct { + *prometheus.CounterVec // embedded, so the vector's own methods (e.g. Reset, WithLabelValues) stay reachable through the adapter +} + +func (p *PrometheusCounterAdapter) Inc(labels map[string]string) { + p.With(labels).Inc() // NOTE(review): With presumably panics on a label set that does not match the vector - confirm callers always pass every label +} + +func (p *PrometheusCounterAdapter) Add(labels map[string]string, val float64) { + p.With(labels).Add(val) +} + +type PrometheusGaugeAdapter struct { + *prometheus.GaugeVec // embedded, so the vector's own methods stay reachable through the adapter +} + +func (p *PrometheusGaugeAdapter) Set(labels map[string]string, val float64) { + p.With(labels).Set(val) +} + +func (p *PrometheusGaugeAdapter) Add(labels map[string]string, val float64) { + p.With(labels).Add(val) +} + +type PrometheusHistogramAdapter struct { + *prometheus.HistogramVec // embedded, so the vector's own methods (e.g. Reset, WithLabelValues) stay reachable through the adapter +} + +func (p *PrometheusHistogramAdapter) Observe(labels map[string]string, val float64) { + p.With(labels).Observe(val) +} diff --git a/pkg/internal/metrics/workqueue.go b/pkg/internal/metrics/workqueue.go deleted file mode 100644 index 
402319817b..0000000000 --- a/pkg/internal/metrics/workqueue.go +++ /dev/null @@ -1,170 +0,0 @@ -/* -Copyright 2018 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package metrics - -import ( - "strconv" - "time" - - "github.com/prometheus/client_golang/prometheus" - "k8s.io/client-go/util/workqueue" - "sigs.k8s.io/controller-runtime/pkg/metrics" -) - -// This file is copied and adapted from k8s.io/component-base/metrics/prometheus/workqueue -// which registers metrics to the k8s legacy Registry. We require very -// similar functionality, but must register metrics to a different Registry. - -// Metrics subsystem and all keys used by the workqueue. 
-const ( - WorkQueueSubsystem = metrics.WorkQueueSubsystem - DepthKey = metrics.DepthKey - AddsKey = metrics.AddsKey - QueueLatencyKey = metrics.QueueLatencyKey - WorkDurationKey = metrics.WorkDurationKey - UnfinishedWorkKey = metrics.UnfinishedWorkKey - LongestRunningProcessorKey = metrics.LongestRunningProcessorKey - RetriesKey = metrics.RetriesKey -) - -var ( - depth = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Subsystem: WorkQueueSubsystem, - Name: DepthKey, - Help: "Current depth of workqueue by workqueue and priority", - }, []string{"name", "controller", "priority"}) - - adds = prometheus.NewCounterVec(prometheus.CounterOpts{ - Subsystem: WorkQueueSubsystem, - Name: AddsKey, - Help: "Total number of adds handled by workqueue", - }, []string{"name", "controller"}) - - latency = prometheus.NewHistogramVec(prometheus.HistogramOpts{ - Subsystem: WorkQueueSubsystem, - Name: QueueLatencyKey, - Help: "How long in seconds an item stays in workqueue before being requested", - Buckets: prometheus.ExponentialBuckets(10e-9, 10, 12), - NativeHistogramBucketFactor: 1.1, - NativeHistogramMaxBucketNumber: 100, - NativeHistogramMinResetDuration: 1 * time.Hour, - }, []string{"name", "controller"}) - - workDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{ - Subsystem: WorkQueueSubsystem, - Name: WorkDurationKey, - Help: "How long in seconds processing an item from workqueue takes.", - Buckets: prometheus.ExponentialBuckets(10e-9, 10, 12), - NativeHistogramBucketFactor: 1.1, - NativeHistogramMaxBucketNumber: 100, - NativeHistogramMinResetDuration: 1 * time.Hour, - }, []string{"name", "controller"}) - - unfinished = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Subsystem: WorkQueueSubsystem, - Name: UnfinishedWorkKey, - Help: "How many seconds of work has been done that " + - "is in progress and hasn't been observed by work_duration. Large " + - "values indicate stuck threads. 
One can deduce the number of stuck " + - "threads by observing the rate at which this increases.", - }, []string{"name", "controller"}) - - longestRunningProcessor = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Subsystem: WorkQueueSubsystem, - Name: LongestRunningProcessorKey, - Help: "How many seconds has the longest running " + - "processor for workqueue been running.", - }, []string{"name", "controller"}) - - retries = prometheus.NewCounterVec(prometheus.CounterOpts{ - Subsystem: WorkQueueSubsystem, - Name: RetriesKey, - Help: "Total number of retries handled by workqueue", - }, []string{"name", "controller"}) -) - -func init() { - metrics.Registry.MustRegister(depth) - metrics.Registry.MustRegister(adds) - metrics.Registry.MustRegister(latency) - metrics.Registry.MustRegister(workDuration) - metrics.Registry.MustRegister(unfinished) - metrics.Registry.MustRegister(longestRunningProcessor) - metrics.Registry.MustRegister(retries) - - workqueue.SetProvider(WorkqueueMetricsProvider{}) -} - -type WorkqueueMetricsProvider struct{} - -func (WorkqueueMetricsProvider) NewDepthMetric(name string) workqueue.GaugeMetric { - return depth.WithLabelValues(name, name, "") // no priority -} - -func (WorkqueueMetricsProvider) NewAddsMetric(name string) workqueue.CounterMetric { - return adds.WithLabelValues(name, name) -} - -func (WorkqueueMetricsProvider) NewLatencyMetric(name string) workqueue.HistogramMetric { - return latency.WithLabelValues(name, name) -} - -func (WorkqueueMetricsProvider) NewWorkDurationMetric(name string) workqueue.HistogramMetric { - return workDuration.WithLabelValues(name, name) -} - -func (WorkqueueMetricsProvider) NewUnfinishedWorkSecondsMetric(name string) workqueue.SettableGaugeMetric { - return unfinished.WithLabelValues(name, name) -} - -func (WorkqueueMetricsProvider) NewLongestRunningProcessorSecondsMetric(name string) workqueue.SettableGaugeMetric { - return longestRunningProcessor.WithLabelValues(name, name) -} - -func 
(WorkqueueMetricsProvider) NewRetriesMetric(name string) workqueue.CounterMetric { - return retries.WithLabelValues(name, name) -} - -type MetricsProviderWithPriority interface { - workqueue.MetricsProvider - - NewDepthMetricWithPriority(name string) DepthMetricWithPriority -} - -// DepthMetricWithPriority represents a depth metric with priority. -type DepthMetricWithPriority interface { - Inc(priority int) - Dec(priority int) -} - -var _ MetricsProviderWithPriority = WorkqueueMetricsProvider{} - -func (WorkqueueMetricsProvider) NewDepthMetricWithPriority(name string) DepthMetricWithPriority { - return &depthWithPriorityMetric{lvs: []string{name, name}} -} - -type depthWithPriorityMetric struct { - lvs []string -} - -func (g *depthWithPriorityMetric) Inc(priority int) { - depth.WithLabelValues(append(g.lvs, strconv.Itoa(priority))...).Inc() -} - -func (g *depthWithPriorityMetric) Dec(priority int) { - depth.WithLabelValues(append(g.lvs, strconv.Itoa(priority))...).Dec() -} diff --git a/pkg/manager/internal.go b/pkg/manager/internal.go index e5204a7506..e47f3082d2 100644 --- a/pkg/manager/internal.go +++ b/pkg/manager/internal.go @@ -35,7 +35,6 @@ import ( "k8s.io/client-go/tools/leaderelection" "k8s.io/client-go/tools/leaderelection/resourcelock" "k8s.io/client-go/tools/record" - "sigs.k8s.io/controller-runtime/pkg/cache" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/cluster" diff --git a/pkg/metrics/leaderelection.go b/pkg/metrics/leaderelection.go index 61e1009d32..89f500b78d 100644 --- a/pkg/metrics/leaderelection.go +++ b/pkg/metrics/leaderelection.go @@ -1,20 +1,32 @@ +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package metrics import ( "github.com/prometheus/client_golang/prometheus" "k8s.io/client-go/tools/leaderelection" + internalmetrics "sigs.k8s.io/controller-runtime/pkg/internal/metrics" ) -// This file is copied and adapted from k8s.io/component-base/metrics/prometheus/clientgo/leaderelection -// which registers metrics to the k8s legacy Registry. We require very -// similar functionality, but must register metrics to a different Registry. - var ( leaderGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "leader_election_master_status", Help: "Gauge of if the reporting system is master of the relevant lease, 0 indicates backup, 1 indicates master. 'name' is the string used to identify the lease. Please make sure to group by name.", }, []string{"name"}) - leaderSlowpathCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "leader_election_slowpath_total", Help: "Total number of slow path exercised in renewing leader leases. 'name' is the string used to identify the lease. 
Please make sure to group by name.", @@ -22,26 +34,74 @@ var ( ) func init() { - Registry.MustRegister(leaderGauge) - leaderelection.SetProvider(leaderelectionMetricsProvider{}) + Registry.MustRegister(leaderGauge, leaderSlowpathCounter) + SetLeaderElectionMetricsProvider(NewPrometheusLeaderElectionMetricsProvider()) } -type leaderelectionMetricsProvider struct{} +var leaderElectionMetricsProvider LeaderElectionMetricsProvider + +// SetLeaderElectionMetricsProvider sets the leader election provider leveraged by client-go +func SetLeaderElectionMetricsProvider(provider LeaderElectionMetricsProvider) { + leaderElectionMetricsProvider = provider + leaderelection.SetProvider(leaderElectionMetricsInternalProvider{provider: provider}) +} + +// GetLeaderElectionMetricsProvider returns the leader election metrics provider +func GetLeaderElectionMetricsProvider() LeaderElectionMetricsProvider { + return leaderElectionMetricsProvider +} + +// LeaderElectionMetricsProvider is an interface that provides methods for firing leader election metrics +type LeaderElectionMetricsProvider interface { + LeaderGauge() internalmetrics.GaugeMetric + SlowpathExercised() internalmetrics.CounterMetric +} -func (leaderelectionMetricsProvider) NewLeaderMetric() leaderelection.LeaderMetric { - return leaderElectionPrometheusAdapter{} +// PrometheusLeaderElectionMetricsProvider is a metrics.LeaderElectionMetricsProvider +// that fires prometheus metrics in response to leader election and controller events +type PrometheusLeaderElectionMetricsProvider struct { + leaderGauge *prometheus.GaugeVec + leaderSlowpathCounter *prometheus.CounterVec } -type leaderElectionPrometheusAdapter struct{} +// NewPrometheusLeaderElectionMetricsProvider creates a PrometheusLeaderElectionMetricsProvider +func NewPrometheusLeaderElectionMetricsProvider() *PrometheusLeaderElectionMetricsProvider { + return &PrometheusLeaderElectionMetricsProvider{ + leaderGauge: leaderGauge, + leaderSlowpathCounter: 
leaderSlowpathCounter, + } +} + +// LeaderGauge returns a Prometheus gauge that fulfills the GaugeMetric interface +func (p PrometheusLeaderElectionMetricsProvider) LeaderGauge() internalmetrics.GaugeMetric { + return &internalmetrics.PrometheusGaugeAdapter{GaugeVec: p.leaderGauge} +} + +// SlowpathExercised returns a Prometheus counter that fulfills the CounterMetric interface +func (p PrometheusLeaderElectionMetricsProvider) SlowpathExercised() internalmetrics.CounterMetric { + return &internalmetrics.PrometheusCounterAdapter{CounterVec: p.leaderSlowpathCounter} +} + +type leaderElectionMetricsInternalProvider struct { + provider LeaderElectionMetricsProvider +} + +func (l leaderElectionMetricsInternalProvider) NewLeaderMetric() leaderelection.LeaderMetric { + return leaderElectionMetricAdapter(l) +} + +type leaderElectionMetricAdapter struct { + provider LeaderElectionMetricsProvider +} -func (s leaderElectionPrometheusAdapter) On(name string) { - leaderGauge.WithLabelValues(name).Set(1.0) +func (l leaderElectionMetricAdapter) On(name string) { + l.provider.LeaderGauge().Set(map[string]string{"name": name}, 1) } -func (s leaderElectionPrometheusAdapter) Off(name string) { - leaderGauge.WithLabelValues(name).Set(0.0) +func (l leaderElectionMetricAdapter) Off(name string) { + l.provider.LeaderGauge().Set(map[string]string{"name": name}, 0) } -func (leaderElectionPrometheusAdapter) SlowpathExercised(name string) { - leaderSlowpathCounter.WithLabelValues(name).Inc() +func (l leaderElectionMetricAdapter) SlowpathExercised(name string) { + l.provider.SlowpathExercised().Inc(map[string]string{"name": name}) } diff --git a/pkg/metrics/registry.go b/pkg/metrics/registry.go index ce17124d53..000bba00f8 100644 --- a/pkg/metrics/registry.go +++ b/pkg/metrics/registry.go @@ -16,7 +16,10 @@ limitations under the License. 
package metrics -import "github.com/prometheus/client_golang/prometheus" +import ( + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/collectors" +) // RegistererGatherer combines both parts of the API of a Prometheus // registry, both the Registerer and the Gatherer interfaces. @@ -28,3 +31,11 @@ type RegistererGatherer interface { // Registry is a prometheus registry for storing metrics within the // controller-runtime. var Registry RegistererGatherer = prometheus.NewRegistry() + +func init() { + Registry.MustRegister( // expose process metrics like CPU, Memory, file descriptor usage etc. + collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}), + // expose all Go runtime metrics like GC stats, memory stats etc. + collectors.NewGoCollector(collectors.WithGoCollectorRuntimeMetrics(collectors.MetricsAll)), + ) +} diff --git a/pkg/metrics/workqueue.go b/pkg/metrics/workqueue.go index cd7ccc773e..b606caa13f 100644 --- a/pkg/metrics/workqueue.go +++ b/pkg/metrics/workqueue.go @@ -16,6 +16,18 @@ limitations under the License. package metrics +import ( + "strconv" + "time" + + "github.com/prometheus/client_golang/prometheus" + "k8s.io/client-go/util/workqueue" +) + +// This file is copied and adapted from k8s.io/component-base/metrics/prometheus/workqueue +// which registers metrics to the k8s legacy Registry. We require very +// similar functionality, but must register metrics to a different Registry. + // Metrics subsystem and all keys used by the workqueue. 
const ( WorkQueueSubsystem = "workqueue" @@ -27,3 +39,163 @@ const ( LongestRunningProcessorKey = "longest_running_processor_seconds" RetriesKey = "retries_total" ) + +var ( + depth = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Subsystem: WorkQueueSubsystem, + Name: DepthKey, + Help: "Current depth of workqueue by workqueue and priority", + }, []string{"name", "controller", "priority"}) + + adds = prometheus.NewCounterVec(prometheus.CounterOpts{ + Subsystem: WorkQueueSubsystem, + Name: AddsKey, + Help: "Total number of adds handled by workqueue", + }, []string{"name", "controller"}) + + latency = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Subsystem: WorkQueueSubsystem, + Name: QueueLatencyKey, + Help: "How long in seconds an item stays in workqueue before being requested", + Buckets: prometheus.ExponentialBuckets(10e-9, 10, 12), + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: 1 * time.Hour, + }, []string{"name", "controller"}) + + workDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Subsystem: WorkQueueSubsystem, + Name: WorkDurationKey, + Help: "How long in seconds processing an item from workqueue takes.", + Buckets: prometheus.ExponentialBuckets(10e-9, 10, 12), + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: 1 * time.Hour, + }, []string{"name", "controller"}) + + unfinished = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Subsystem: WorkQueueSubsystem, + Name: UnfinishedWorkKey, + Help: "How many seconds of work has been done that " + + "is in progress and hasn't been observed by work_duration. Large " + + "values indicate stuck threads. 
One can deduce the number of stuck " + + "threads by observing the rate at which this increases.", + }, []string{"name", "controller"}) + + longestRunningProcessor = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Subsystem: WorkQueueSubsystem, + Name: LongestRunningProcessorKey, + Help: "How many seconds has the longest running " + + "processor for workqueue been running.", + }, []string{"name", "controller"}) + + retries = prometheus.NewCounterVec(prometheus.CounterOpts{ + Subsystem: WorkQueueSubsystem, + Name: RetriesKey, + Help: "Total number of retries handled by workqueue", + }, []string{"name", "controller"}) +) + +func init() { + Registry.MustRegister( + depth, + adds, + latency, + workDuration, + unfinished, + longestRunningProcessor, + retries, + ) + SetWorkqueueMetricsProvider(NewPrometheusWorkqueueMetricsProvider()) +} + +var workqueueMetricsProvider workqueue.MetricsProvider + +// SetWorkqueueMetricsProvider sets the workqueue metrics provider leveraged by client-go +func SetWorkqueueMetricsProvider(provider workqueue.MetricsProvider) { + workqueueMetricsProvider = provider + workqueue.SetProvider(provider) +} + +// GetWorkqueueMetricsProvider returns the workqueue metrics provider +func GetWorkqueueMetricsProvider() workqueue.MetricsProvider { + return workqueueMetricsProvider +} + +// PrometheusWorkqueueMetricsProvider implements the metrics provider for exposing workqueue metrics from client-go +type PrometheusWorkqueueMetricsProvider struct{} + +// NewPrometheusWorkqueueMetricsProvider returns a new PrometheusWorkqueueMetricsProvider +func NewPrometheusWorkqueueMetricsProvider() *PrometheusWorkqueueMetricsProvider { + return &PrometheusWorkqueueMetricsProvider{} +} + +// NewDepthMetric creates a Gauge metric from the depth GaugeVec +func (PrometheusWorkqueueMetricsProvider) NewDepthMetric(name string) workqueue.GaugeMetric { + return depth.WithLabelValues(name, name, "") // no priority +} + +// NewAddsMetric creates a Counter metric from the adds 
CounterVec +func (PrometheusWorkqueueMetricsProvider) NewAddsMetric(name string) workqueue.CounterMetric { + return adds.WithLabelValues(name, name) +} + +// NewLatencyMetric creates a Histogram metric from the latency HistogramVec +func (PrometheusWorkqueueMetricsProvider) NewLatencyMetric(name string) workqueue.HistogramMetric { + return latency.WithLabelValues(name, name) +} + +// NewWorkDurationMetric creates a Histogram metric from the workDuration HistogramVec +func (PrometheusWorkqueueMetricsProvider) NewWorkDurationMetric(name string) workqueue.HistogramMetric { + return workDuration.WithLabelValues(name, name) +} + +// NewUnfinishedWorkSecondsMetric creates a Gauge metric from the unfinished GaugeVec +func (PrometheusWorkqueueMetricsProvider) NewUnfinishedWorkSecondsMetric(name string) workqueue.SettableGaugeMetric { + return unfinished.WithLabelValues(name, name) +} + +// NewLongestRunningProcessorSecondsMetric creates a Gauge metric from the longestRunningProcessor GaugeVec +func (PrometheusWorkqueueMetricsProvider) NewLongestRunningProcessorSecondsMetric(name string) workqueue.SettableGaugeMetric { + return longestRunningProcessor.WithLabelValues(name, name) +} + +// NewRetriesMetric creates a Counter metric from the retries CounterVec +func (PrometheusWorkqueueMetricsProvider) NewRetriesMetric(name string) workqueue.CounterMetric { + return retries.WithLabelValues(name, name) +} + +// MetricsProviderWithPriority extends workqueue.MetricsProvider with a NewDepthMetricWithPriority method +// +//nolint:revive +type MetricsProviderWithPriority interface { + workqueue.MetricsProvider + + // NewDepthMetricWithPriority creates an implementation of DepthMetricWithPriority + NewDepthMetricWithPriority(name string) DepthMetricWithPriority +} + +// DepthMetricWithPriority represents a depth metric with priority.
+type DepthMetricWithPriority interface { + Inc(priority int) + Dec(priority int) +} + +var _ MetricsProviderWithPriority = PrometheusWorkqueueMetricsProvider{} + +// NewDepthMetricWithPriority returns a DepthMetricWithPriority from the PrometheusWorkqueueMetricsProvider +func (PrometheusWorkqueueMetricsProvider) NewDepthMetricWithPriority(name string) DepthMetricWithPriority { + return &depthWithPriorityMetric{lvs: []string{name, name}} +} + +type depthWithPriorityMetric struct { + lvs []string +} + +func (g *depthWithPriorityMetric) Inc(priority int) { + depth.WithLabelValues(append(g.lvs, strconv.Itoa(priority))...).Inc() +} + +func (g *depthWithPriorityMetric) Dec(priority int) { + depth.WithLabelValues(append(g.lvs, strconv.Itoa(priority))...).Dec() +}