This repository was archived by the owner on Oct 21, 2020. It is now read-only.

Commit 732d6e7

Author: Matthew Wong
Parents: 559ed29 + 58b4e13

Merge pull request #796 from cofyc/metrics-server

Add metrics server support for provision controller

File tree: 2 files changed (+187, −5 lines)

lib/controller/controller.go

Lines changed: 102 additions & 5 deletions
@@ -18,22 +18,29 @@ package controller
 
 import (
 	"fmt"
+	"net"
+	"net/http"
 	"os/exec"
 	"reflect"
+	"strconv"
 	"strings"
 	"sync"
 	"time"
 
 	"github.com/golang/glog"
+	"github.com/kubernetes-incubator/external-storage/lib/controller/metrics"
 	"github.com/kubernetes-incubator/external-storage/lib/leaderelection"
 	rl "github.com/kubernetes-incubator/external-storage/lib/leaderelection/resourcelock"
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promhttp"
 	"k8s.io/api/core/v1"
 	storage "k8s.io/api/storage/v1"
 	storagebeta "k8s.io/api/storage/v1beta1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/apimachinery/pkg/util/uuid"
+	"k8s.io/apimachinery/pkg/util/wait"
 	"k8s.io/apimachinery/pkg/watch"
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/kubernetes/scheme"
@@ -122,6 +129,13 @@ type ProvisionController struct {
 	failedProvisionStats, failedDeleteStats map[types.UID]int
 	failedProvisionStatsMutex, failedDeleteStatsMutex *sync.Mutex
 
+	// The port for the metrics server to serve on.
+	metricsPort int32
+	// The IP address for the metrics server to serve on.
+	metricsAddress string
+	// The path of the metrics endpoint.
+	metricsPath string
+
 	// Parameters of leaderelection.LeaderElectionConfig. Leader election is for
 	// when multiple controllers are running: they race to lock (lead) every PVC
 	// so that only one calls Provision for it (saving API calls, CPU cycles...)
@@ -156,6 +170,12 @@ const (
 	DefaultRetryPeriod = 2 * time.Second
 	// DefaultTermLimit is used when option function TermLimit is omitted
 	DefaultTermLimit = 30 * time.Second
+	// DefaultMetricsPort is used when option function MetricsPort is omitted
+	DefaultMetricsPort = 0
+	// DefaultMetricsAddress is used when option function MetricsAddress is omitted
+	DefaultMetricsAddress = "0.0.0.0"
+	// DefaultMetricsPath is used when option function MetricsPath is omitted
+	DefaultMetricsPath = "/metrics"
 )
 
 var errRuntime = fmt.Errorf("cannot call option functions after controller has Run")
@@ -316,6 +336,39 @@ func ClassesInformer(informer cache.SharedInformer) func(*ProvisionController) e
 	}
 }
 
+// MetricsPort sets the port that the metrics server serves on. Default: 0, set to non-zero to enable.
+func MetricsPort(metricsPort int32) func(*ProvisionController) error {
+	return func(c *ProvisionController) error {
+		if c.HasRun() {
+			return errRuntime
+		}
+		c.metricsPort = metricsPort
+		return nil
+	}
+}
+
+// MetricsAddress sets the IP address that the metrics server serves on.
+func MetricsAddress(metricsAddress string) func(*ProvisionController) error {
+	return func(c *ProvisionController) error {
+		if c.HasRun() {
+			return errRuntime
+		}
+		c.metricsAddress = metricsAddress
+		return nil
+	}
+}
+
+// MetricsPath sets the endpoint path of the metrics server.
+func MetricsPath(metricsPath string) func(*ProvisionController) error {
+	return func(c *ProvisionController) error {
+		if c.HasRun() {
+			return errRuntime
+		}
+		c.metricsPath = metricsPath
+		return nil
+	}
+}
+
 // NewProvisionController creates a new provision controller using
 // the given configuration parameters and with private (non-shared) informers.
 func NewProvisionController(
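For orientation, a minimal usage sketch of the new option functions (not part of this commit). It assumes the library's functional-options calling convention for NewProvisionController (client, provisioner name, Provisioner implementation, server version string, then options); the provisioner name, version string, and myProvisioner value are hypothetical placeholders.

package main

import (
	"github.com/kubernetes-incubator/external-storage/lib/controller"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
)

// myProvisioner stands in for a real controller.Provisioner implementation (not shown).
var myProvisioner controller.Provisioner

func main() {
	// In-cluster client setup; error handling kept minimal for brevity.
	config, err := rest.InClusterConfig()
	if err != nil {
		panic(err)
	}
	clientset := kubernetes.NewForConfigOrDie(config)

	// metricsPort defaults to 0, so the metrics server stays off unless a port is set.
	pc := controller.NewProvisionController(
		clientset,
		"example.com/my-provisioner", // provisioner name (hypothetical)
		myProvisioner,
		"v1.9.0", // Kubernetes server version string (hypothetical)
		controller.MetricsPort(8080),
		controller.MetricsAddress("0.0.0.0"),
		controller.MetricsPath("/metrics"),
	)
	pc.Run(wait.NeverStop)
}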
@@ -360,6 +413,9 @@ func NewProvisionController(
 		renewDeadline: DefaultRenewDeadline,
 		retryPeriod: DefaultRetryPeriod,
 		termLimit: DefaultTermLimit,
+		metricsPort: DefaultMetricsPort,
+		metricsAddress: DefaultMetricsAddress,
+		metricsPath: DefaultMetricsPath,
 		leaderElectors: make(map[types.UID]*leaderelection.LeaderElector),
 		leaderElectorsMutex: &sync.Mutex{},
 		hasRun: false,
@@ -493,6 +549,25 @@ func (ctrl *ProvisionController) Run(stopCh <-chan struct{}) {
 	ctrl.hasRunLock.Lock()
 	ctrl.hasRun = true
 	ctrl.hasRunLock.Unlock()
+	if ctrl.metricsPort > 0 {
+		prometheus.MustRegister([]prometheus.Collector{
+			metrics.PersistentVolumeClaimProvisionTotal,
+			metrics.PersistentVolumeClaimProvisionFailedTotal,
+			metrics.PersistentVolumeClaimProvisionDurationSeconds,
+			metrics.PersistentVolumeDeleteTotal,
+			metrics.PersistentVolumeDeleteFailedTotal,
+			metrics.PersistentVolumeDeleteDurationSeconds,
+		}...)
+		http.Handle(ctrl.metricsPath, promhttp.Handler())
+		address := net.JoinHostPort(ctrl.metricsAddress, strconv.FormatInt(int64(ctrl.metricsPort), 10))
+		glog.Infof("Starting metrics server at %s\n", address)
+		go wait.Forever(func() {
+			err := http.ListenAndServe(address, nil)
+			if err != nil {
+				glog.Errorf("Failed to listen on %s: %v", address, err)
+			}
+		}, 5*time.Second)
+	}
 	go ctrl.claimController.Run(stopCh)
 	go ctrl.volumeController.Run(stopCh)
 	go ctrl.classController.Run(stopCh)
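A note on the bootstrap above: wait.Forever re-invokes http.ListenAndServe every 5 seconds if it returns, so a failed bind is retried instead of killing the controller. Once running, the endpoint answers plain HTTP GETs; a hedged probe sketch (not part of the commit), assuming MetricsPort(8080) with the default address and path:

package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
)

func main() {
	// 127.0.0.1:8080 is an assumption; use whatever MetricsAddress/MetricsPort were configured.
	resp, err := http.Get("http://127.0.0.1:8080/metrics")
	if err != nil {
		fmt.Println("metrics endpoint not reachable:", err)
		return
	}
	defer resp.Body.Close()

	body, _ := ioutil.ReadAll(resp.Body)
	fmt.Print(string(body)) // Prometheus text exposition format
}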
@@ -536,8 +611,9 @@ func (ctrl *ProvisionController) addClaim(obj interface{}) {
 		if ok && le.IsLeader() {
 			opName := fmt.Sprintf("provision-%s[%s]", claimToClaimKey(claim), string(claim.UID))
 			ctrl.scheduleOperation(opName, func() error {
+				startTime := time.Now()
 				err := ctrl.provisionClaimOperation(claim)
-				ctrl.updateProvisionStats(claim, err)
+				ctrl.updateProvisionStats(claim, err, startTime)
 				return err
 			})
 		} else {
@@ -596,8 +672,9 @@ func (ctrl *ProvisionController) updateVolume(oldObj, newObj interface{}) {
 	if ctrl.shouldDelete(volume) {
 		opName := fmt.Sprintf("delete-%s[%s]", volume.Name, string(volume.UID))
 		ctrl.scheduleOperation(opName, func() error {
+			startTime := time.Now()
 			err := ctrl.deleteVolumeOperation(volume)
-			ctrl.updateDeleteStats(volume, err)
+			ctrl.updateDeleteStats(volume, err, startTime)
 			return err
 		})
 	}
@@ -741,8 +818,9 @@ func (ctrl *ProvisionController) lockProvisionClaimOperation(claim *v1.Persisten
 		OnStartedLeading: func(_ <-chan struct{}) {
 			opName := fmt.Sprintf("provision-%s[%s]", claimToClaimKey(claim), string(claim.UID))
 			ctrl.scheduleOperation(opName, func() error {
+				startTime := time.Now()
 				err := ctrl.provisionClaimOperation(claim)
-				ctrl.updateProvisionStats(claim, err)
+				ctrl.updateProvisionStats(claim, err, startTime)
 				return err
 			})
 		},
@@ -785,10 +863,21 @@ func (ctrl *ProvisionController) lockProvisionClaimOperation(claim *v1.Persisten
 	ctrl.leaderElectorsMutex.Unlock()
 }
 
-func (ctrl *ProvisionController) updateProvisionStats(claim *v1.PersistentVolumeClaim, err error) {
+func (ctrl *ProvisionController) updateProvisionStats(claim *v1.PersistentVolumeClaim, err error, startTime time.Time) {
 	ctrl.failedProvisionStatsMutex.Lock()
 	defer ctrl.failedProvisionStatsMutex.Unlock()
 
+	class := ""
+	if claim.Spec.StorageClassName != nil {
+		class = *claim.Spec.StorageClassName
+	}
+	if err != nil {
+		metrics.PersistentVolumeClaimProvisionFailedTotal.WithLabelValues(class).Inc()
+	} else {
+		metrics.PersistentVolumeClaimProvisionDurationSeconds.WithLabelValues(class).Observe(time.Since(startTime).Seconds())
+		metrics.PersistentVolumeClaimProvisionTotal.WithLabelValues(class).Inc()
+	}
+
 	// Do not record the failed claim info when failedProvisionThreshold is not set
 	if ctrl.failedProvisionThreshold <= 0 {
 		return
@@ -806,10 +895,18 @@ func (ctrl *ProvisionController) updateProvisionStats(claim *v1.PersistentVolume
 	}
 }
 
-func (ctrl *ProvisionController) updateDeleteStats(volume *v1.PersistentVolume, err error) {
+func (ctrl *ProvisionController) updateDeleteStats(volume *v1.PersistentVolume, err error, startTime time.Time) {
 	ctrl.failedDeleteStatsMutex.Lock()
 	defer ctrl.failedDeleteStatsMutex.Unlock()
 
+	class := volume.Spec.StorageClassName
+	if err != nil {
+		metrics.PersistentVolumeDeleteFailedTotal.WithLabelValues(class).Inc()
+	} else {
+		metrics.PersistentVolumeDeleteDurationSeconds.WithLabelValues(class).Observe(time.Since(startTime).Seconds())
+		metrics.PersistentVolumeDeleteTotal.WithLabelValues(class).Inc()
+	}
+
 	// Do not record the failed volume info when failedDeleteThreshold is not set
 	if ctrl.failedDeleteThreshold <= 0 {
 		return

lib/controller/metrics/metrics.go

Lines changed: 85 additions & 0 deletions
@@ -0,0 +1,85 @@
+/*
+Copyright 2018 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package metrics
+
+import (
+	"github.com/prometheus/client_golang/prometheus"
+)
+
+const (
+	// ControllerSubsystem is the prometheus subsystem name.
+	ControllerSubsystem = "controller"
+)
+
+var (
+	// PersistentVolumeClaimProvisionTotal is used to collect the accumulated count of persistent volumes provisioned.
+	PersistentVolumeClaimProvisionTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Subsystem: ControllerSubsystem,
+			Name:      "persistentvolumeclaim_provision_total",
+			Help:      "Total number of persistent volumes provisioned. Broken down by storage class name.",
+		},
+		[]string{"class"},
+	)
+	// PersistentVolumeClaimProvisionFailedTotal is used to collect the accumulated count of failed persistent volume provision attempts.
+	PersistentVolumeClaimProvisionFailedTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Subsystem: ControllerSubsystem,
+			Name:      "persistentvolumeclaim_provision_failed_total",
+			Help:      "Total number of persistent volume provision failed attempts. Broken down by storage class name.",
+		},
+		[]string{"class"},
+	)
+	// PersistentVolumeClaimProvisionDurationSeconds is used to collect latency in seconds to provision persistent volumes.
+	PersistentVolumeClaimProvisionDurationSeconds = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Subsystem: ControllerSubsystem,
+			Name:      "persistentvolumeclaim_provision_duration_seconds",
+			Help:      "Latency in seconds to provision persistent volumes. Broken down by storage class name.",
+			Buckets:   prometheus.DefBuckets,
+		},
+		[]string{"class"},
+	)
+	// PersistentVolumeDeleteTotal is used to collect the accumulated count of persistent volumes deleted.
+	PersistentVolumeDeleteTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Subsystem: ControllerSubsystem,
+			Name:      "persistentvolume_delete_total",
+			Help:      "Total number of persistent volumes deleted. Broken down by storage class name.",
+		},
+		[]string{"class"},
+	)
+	// PersistentVolumeDeleteFailedTotal is used to collect the accumulated count of failed persistent volume delete attempts.
+	PersistentVolumeDeleteFailedTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Subsystem: ControllerSubsystem,
+			Name:      "persistentvolume_delete_failed_total",
+			Help:      "Total number of persistent volume delete failed attempts. Broken down by storage class name.",
+		},
+		[]string{"class"},
+	)
+	// PersistentVolumeDeleteDurationSeconds is used to collect latency in seconds to delete persistent volumes.
+	PersistentVolumeDeleteDurationSeconds = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Subsystem: ControllerSubsystem,
+			Name:      "persistentvolume_delete_duration_seconds",
+			Help:      "Latency in seconds to delete persistent volumes. Broken down by storage class name.",
+			Buckets:   prometheus.DefBuckets,
+		},
+		[]string{"class"},
+	)
+)
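Because every collector above sets Subsystem to "controller", the exposed series names come out prefixed with controller_, e.g. controller_persistentvolumeclaim_provision_total and controller_persistentvolume_delete_duration_seconds_bucket, each carrying a class label. A small, hedged standalone sketch (not part of the commit) of how these collectors behave once registered, using a hypothetical "standard" storage class:

package main

import (
	"net/http"

	"github.com/kubernetes-incubator/external-storage/lib/controller/metrics"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
	// Register two of the collectors and record one fake successful provision,
	// mirroring what updateProvisionStats does on its success path.
	prometheus.MustRegister(
		metrics.PersistentVolumeClaimProvisionTotal,
		metrics.PersistentVolumeClaimProvisionDurationSeconds,
	)
	metrics.PersistentVolumeClaimProvisionTotal.WithLabelValues("standard").Inc()
	metrics.PersistentVolumeClaimProvisionDurationSeconds.WithLabelValues("standard").Observe(0.42)

	// Expose the default registry the same way ProvisionController.Run does.
	http.Handle("/metrics", promhttp.Handler())
	http.ListenAndServe(":8080", nil)
}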
