Skip to content

Commit 697cbd3

Browse files
SerjKol80 and Sergey Kolosov authored
statistics: Add 'store' label to metric pd_cluster_status. (#9898)
close #9855 Add 'store' label to metric pd_cluster_status. Signed-off-by: Sergey Kolosov <sergey.kolosov@airbnb.com> Co-authored-by: Sergey Kolosov <sergey.kolosov@airbnb.com>
1 parent 5de7eae commit 697cbd3

File tree

4 files changed

+96
-195
lines changed

4 files changed

+96
-195
lines changed

pkg/statistics/metrics.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -47,7 +47,7 @@ var (
4747
Subsystem: "cluster",
4848
Name: "status",
4949
Help: "Status of the cluster.",
50-
}, []string{"type", "engine"})
50+
}, []string{"type", "engine", "store"})
5151

5252
placementStatusGauge = prometheus.NewGaugeVec(
5353
prometheus.GaugeOpts{

pkg/statistics/store_collection.go

Lines changed: 72 additions & 177 deletions
Original file line number | Diff line number | Diff line change
@@ -30,116 +30,89 @@ import (
3030
const (
3131
unknown = "unknown"
3232
labelType = "label"
33-
)
34-
35-
var (
36-
// tikv status counters.
37-
tikvUpCounter = clusterStatusGauge.WithLabelValues("store_up_count", "tikv")
38-
tikvDiconnectedCounter = clusterStatusGauge.WithLabelValues("store_disconnected_count", "tikv")
39-
tikvDownCounter = clusterStatusGauge.WithLabelValues("store_down_count", "tikv")
40-
tikvUnhealthCounter = clusterStatusGauge.WithLabelValues("store_unhealth_count", "tikv")
41-
tikvOfflineCounter = clusterStatusGauge.WithLabelValues("store_offline_count", "tikv")
42-
tikvTombstoneCounter = clusterStatusGauge.WithLabelValues("store_tombstone_count", "tikv")
43-
tikvLowSpaceCounter = clusterStatusGauge.WithLabelValues("store_low_space_count", "tikv")
44-
tikvPreparingCounter = clusterStatusGauge.WithLabelValues("store_preparing_count", "tikv")
45-
tikvServingCounter = clusterStatusGauge.WithLabelValues("store_serving_count", "tikv")
46-
tikvRemovingCounter = clusterStatusGauge.WithLabelValues("store_removing_count", "tikv")
47-
tikvRemovedCounter = clusterStatusGauge.WithLabelValues("store_removed_count", "tikv")
4833

49-
// tiflash status counters.
50-
tiflashUpCounter = clusterStatusGauge.WithLabelValues("store_up_count", "tiflash")
51-
tiflashDiconnectedCounter = clusterStatusGauge.WithLabelValues("store_disconnected_count", "tiflash")
52-
tiflashDownCounter = clusterStatusGauge.WithLabelValues("store_down_count", "tiflash")
53-
tiflashUnhealthCounter = clusterStatusGauge.WithLabelValues("store_unhealth_count", "tiflash")
54-
tiflashOfflineCounter = clusterStatusGauge.WithLabelValues("store_offline_count", "tiflash")
55-
tiflashTombstoneCounter = clusterStatusGauge.WithLabelValues("store_tombstone_count", "tiflash")
56-
tiflashLowSpaceCounter = clusterStatusGauge.WithLabelValues("store_low_space_count", "tiflash")
57-
tiflashPreparingCounter = clusterStatusGauge.WithLabelValues("store_preparing_count", "tiflash")
58-
tiflashServingCounter = clusterStatusGauge.WithLabelValues("store_serving_count", "tiflash")
59-
tiflashRemovingCounter = clusterStatusGauge.WithLabelValues("store_removing_count", "tiflash")
60-
tiflashRemovedCounter = clusterStatusGauge.WithLabelValues("store_removed_count", "tiflash")
61-
62-
// Store status metrics.
63-
storeRegionCountGauge = clusterStatusGauge.WithLabelValues("region_count", "all")
64-
storeLeaderCountGauge = clusterStatusGauge.WithLabelValues("leader_count", "all")
65-
storeWitnessCountGauge = clusterStatusGauge.WithLabelValues("witness_count", "all")
66-
storeLearnerCountGauge = clusterStatusGauge.WithLabelValues("learner_count", "all")
67-
storeStorageSizeGauge = clusterStatusGauge.WithLabelValues("storage_size", "all")
68-
storeStorageCapacityGauge = clusterStatusGauge.WithLabelValues("storage_capacity", "all")
34+
clusterStatusStoreUpCount = "store_up_count"
35+
clusterStatusStoreDisconnectedCount = "store_disconnected_count"
36+
clusterStatusStoreSlowCount = "store_slow_count"
37+
clusterStatusStoreDownCount = "store_down_count"
38+
clusterStatusStoreUnhealthCount = "store_unhealth_count"
39+
clusterStatusStoreOfflineCount = "store_offline_count"
40+
clusterStatusStoreTombstoneCount = "store_tombstone_count"
41+
clusterStatusStoreLowSpaceCount = "store_low_space_count"
42+
clusterStatusStorePreparingCount = "store_preparing_count"
43+
clusterStatusStoreServingCount = "store_serving_count"
44+
clusterStatusStoreRemovingCount = "store_removing_count"
45+
clusterStatusStoreRemovedCount = "store_removed_count"
46+
47+
clusterStatusRegionCount = "region_count"
48+
clusterStatusLeaderCount = "leader_count"
49+
clusterStatusWitnessCount = "witness_count"
50+
clusterStatusLearnerCount = "learner_count"
51+
clusterStatusStorageSize = "storage_size"
52+
clusterStatusStorageCapacity = "storage_capacity"
6953
)
7054

7155
type storeStatistics struct {
72-
opt config.ConfProvider
73-
StorageSize uint64
74-
StorageCapacity uint64
75-
RegionCount int
76-
LeaderCount int
77-
LearnerCount int
78-
WitnessCount int
79-
LabelCounter map[string][]uint64
80-
81-
engineStatistics map[string]*storeStatusStatistics
56+
opt config.ConfProvider
57+
LabelCounter map[string][]uint64
8258
}
8359

84-
type storeStatusStatistics struct {
85-
opt config.ConfProvider
86-
Up int
87-
Disconnect int
88-
Unhealthy int
89-
Down int
90-
Offline int
91-
Tombstone int
92-
LowSpace int
93-
Slow int
94-
Preparing int
95-
Serving int
96-
Removing int
97-
Removed int
60+
func newStoreStatistics(opt config.ConfProvider) *storeStatistics {
61+
return &storeStatistics{
62+
opt: opt,
63+
LabelCounter: make(map[string][]uint64),
64+
}
9865
}
9966

100-
func (s *storeStatusStatistics) observe(store *core.StoreInfo) {
67+
func (s *storeStatistics) observeStoreStatus(store *core.StoreInfo) map[string]float64 {
68+
result := map[string]float64{
69+
clusterStatusStoreUpCount: 0,
70+
clusterStatusStoreDisconnectedCount: 0,
71+
clusterStatusStoreSlowCount: 0,
72+
clusterStatusStoreDownCount: 0,
73+
clusterStatusStoreUnhealthCount: 0,
74+
clusterStatusStoreOfflineCount: 0,
75+
clusterStatusStoreTombstoneCount: 0,
76+
clusterStatusStoreLowSpaceCount: 0,
77+
clusterStatusStorePreparingCount: 0,
78+
clusterStatusStoreServingCount: 0,
79+
clusterStatusStoreRemovingCount: 0,
80+
clusterStatusStoreRemovedCount: 0,
81+
}
82+
10183
// Store state.
10284
isDown := false
10385
switch store.GetNodeState() {
10486
case metapb.NodeState_Preparing, metapb.NodeState_Serving:
10587
if store.DownTime() >= s.opt.GetMaxStoreDownTime() {
10688
isDown = true
107-
s.Down++
89+
result[clusterStatusStoreDownCount]++
10890
} else if store.IsUnhealthy() {
109-
s.Unhealthy++
91+
result[clusterStatusStoreUnhealthCount]++
11092
} else if store.IsDisconnected() {
111-
s.Disconnect++
93+
result[clusterStatusStoreDisconnectedCount]++
11294
} else if store.IsSlow() {
113-
s.Slow++
95+
result[clusterStatusStoreSlowCount]++
11496
} else {
115-
s.Up++
97+
result[clusterStatusStoreUpCount]++
11698
}
11799
if store.IsPreparing() {
118-
s.Preparing++
100+
result[clusterStatusStorePreparingCount]++
119101
} else {
120-
s.Serving++
102+
result[clusterStatusStoreServingCount]++
121103
}
122104
case metapb.NodeState_Removing:
123-
s.Offline++
124-
s.Removing++
105+
result[clusterStatusStoreOfflineCount]++
106+
result[clusterStatusStoreRemovingCount]++
125107
case metapb.NodeState_Removed:
126-
s.Tombstone++
127-
s.Removed++
128-
return
108+
result[clusterStatusStoreTombstoneCount]++
109+
result[clusterStatusStoreRemovedCount]++
110+
return result
129111
}
130112
if !isDown && store.IsLowSpace(s.opt.GetLowSpaceRatio()) {
131-
s.LowSpace++
132-
}
133-
}
134-
135-
func newStoreStatistics(opt config.ConfProvider) *storeStatistics {
136-
statistics := make(map[string]*storeStatusStatistics, 1)
137-
statistics[core.EngineTiKV] = &storeStatusStatistics{opt: opt}
138-
return &storeStatistics{
139-
opt: opt,
140-
LabelCounter: make(map[string][]uint64),
141-
engineStatistics: statistics,
113+
result[clusterStatusStoreLowSpaceCount]++
142114
}
115+
return result
143116
}
144117

145118
func (s *storeStatistics) observe(store *core.StoreInfo) {
@@ -156,31 +129,28 @@ func (s *storeStatistics) observe(store *core.StoreInfo) {
156129
}
157130
storeAddress := store.GetAddress()
158131
id := strconv.FormatUint(store.GetID(), 10)
159-
// Store state.
160-
var statistics *storeStatusStatistics
161-
if !store.IsTiKV() {
162-
statistics = s.engineStatistics[core.EngineTiFlash]
163-
if statistics == nil {
164-
s.engineStatistics[core.EngineTiFlash] = &storeStatusStatistics{opt: s.opt}
165-
statistics = s.engineStatistics[core.EngineTiFlash]
166-
}
132+
var engine string
133+
if store.IsTiKV() {
134+
engine = core.EngineTiKV
167135
} else {
168-
// tikv statistics has been initialized in newStoreStatistics.
169-
statistics = s.engineStatistics[core.EngineTiKV]
136+
engine = core.EngineTiFlash
137+
}
138+
storeStatusStats := s.observeStoreStatus(store)
139+
for statusType, value := range storeStatusStats {
140+
clusterStatusGauge.WithLabelValues(statusType, engine, id).Set(value)
170141
}
171-
statistics.observe(store)
172142
// skip tombstone store to avoid overwriting metrics
173143
if store.GetNodeState() == metapb.NodeState_Removed {
174144
return
175145
}
176146

177147
// Store stats.
178-
s.StorageSize += store.StorageSize()
179-
s.StorageCapacity += store.GetCapacity()
180-
s.RegionCount += store.GetRegionCount()
181-
s.LeaderCount += store.GetLeaderCount()
182-
s.WitnessCount += store.GetWitnessCount()
183-
s.LearnerCount += store.GetLearnerCount()
148+
clusterStatusGauge.WithLabelValues(clusterStatusStorageSize, engine, id).Set(float64(store.StorageSize()))
149+
clusterStatusGauge.WithLabelValues(clusterStatusStorageCapacity, engine, id).Set(float64(store.GetCapacity()))
150+
clusterStatusGauge.WithLabelValues(clusterStatusRegionCount, engine, id).Set(float64(store.GetRegionCount()))
151+
clusterStatusGauge.WithLabelValues(clusterStatusLeaderCount, engine, id).Set(float64(store.GetLeaderCount()))
152+
clusterStatusGauge.WithLabelValues(clusterStatusWitnessCount, engine, id).Set(float64(store.GetWitnessCount()))
153+
clusterStatusGauge.WithLabelValues(clusterStatusLearnerCount, engine, id).Set(float64(store.GetLearnerCount()))
184154
limit, ok := store.GetStoreLimit().(*storelimit.SlidingWindows)
185155
if ok {
186156
cap := limit.GetCap()
@@ -247,46 +217,6 @@ func ObserveHotStat(store *core.StoreInfo, stats *StoresStats) {
247217
func (s *storeStatistics) collect() {
248218
placementStatusGauge.Reset()
249219

250-
// tikv store status metrics.
251-
tikvStatistics, ok := s.engineStatistics[core.EngineTiKV]
252-
if ok {
253-
tikvUpCounter.Set(float64(tikvStatistics.Up))
254-
tikvDiconnectedCounter.Set(float64(tikvStatistics.Disconnect))
255-
tikvDownCounter.Set(float64(tikvStatistics.Down))
256-
tikvUnhealthCounter.Set(float64(tikvStatistics.Unhealthy))
257-
tikvOfflineCounter.Set(float64(tikvStatistics.Offline))
258-
tikvTombstoneCounter.Set(float64(tikvStatistics.Tombstone))
259-
tikvLowSpaceCounter.Set(float64(tikvStatistics.LowSpace))
260-
tikvPreparingCounter.Set(float64(tikvStatistics.Preparing))
261-
tikvServingCounter.Set(float64(tikvStatistics.Serving))
262-
tikvRemovingCounter.Set(float64(tikvStatistics.Removing))
263-
tikvRemovedCounter.Set(float64(tikvStatistics.Removed))
264-
}
265-
266-
// tiflash store status metrics.
267-
tiflashStatistics, ok := s.engineStatistics[core.EngineTiFlash]
268-
if ok {
269-
tiflashUpCounter.Set(float64(tiflashStatistics.Up))
270-
tiflashDiconnectedCounter.Set(float64(tiflashStatistics.Disconnect))
271-
tiflashDownCounter.Set(float64(tiflashStatistics.Down))
272-
tiflashUnhealthCounter.Set(float64(tiflashStatistics.Unhealthy))
273-
tiflashOfflineCounter.Set(float64(tiflashStatistics.Offline))
274-
tiflashTombstoneCounter.Set(float64(tiflashStatistics.Tombstone))
275-
tiflashLowSpaceCounter.Set(float64(tiflashStatistics.LowSpace))
276-
tiflashPreparingCounter.Set(float64(tiflashStatistics.Preparing))
277-
tiflashServingCounter.Set(float64(tiflashStatistics.Serving))
278-
tiflashRemovingCounter.Set(float64(tiflashStatistics.Removing))
279-
tiflashRemovedCounter.Set(float64(tiflashStatistics.Removed))
280-
}
281-
282-
// Store status metrics.
283-
storeRegionCountGauge.Set(float64(s.RegionCount))
284-
storeLeaderCountGauge.Set(float64(s.LeaderCount))
285-
storeWitnessCountGauge.Set(float64(s.WitnessCount))
286-
storeLearnerCountGauge.Set(float64(s.LearnerCount))
287-
storeStorageSizeGauge.Set(float64(s.StorageSize))
288-
storeStorageCapacityGauge.Set(float64(s.StorageCapacity))
289-
290220
// Current scheduling configurations of the cluster
291221
configs := make(map[string]float64)
292222
configs["leader-schedule-limit"] = float64(s.opt.GetLeaderScheduleLimit())
@@ -374,6 +304,7 @@ func ResetStoreStatistics(storeAddress string, id string) {
374304
for _, m := range metrics {
375305
storeStatusGauge.DeleteLabelValues(storeAddress, id, m)
376306
}
307+
clusterStatusGauge.DeletePartialMatch(utils.SingleLabel("store", id))
377308
}
378309

379310
type storeStatisticsMap struct {
@@ -403,44 +334,8 @@ func (m *storeStatisticsMap) Collect() {
403334
func Reset() {
404335
storeStatusGauge.Reset()
405336
placementStatusGauge.Reset()
406-
ResetClusterStatusMetrics()
337+
clusterStatusGauge.Reset()
407338
ResetRegionStatsMetrics()
408339
ResetLabelStatsMetrics()
409340
ResetHotCacheStatusMetrics()
410341
}
411-
412-
// ResetClusterStatusMetrics resets the cluster status metrics.
413-
func ResetClusterStatusMetrics() {
414-
tikvUpCounter.Set(0)
415-
tikvDiconnectedCounter.Set(0)
416-
tikvDownCounter.Set(0)
417-
tikvUnhealthCounter.Set(0)
418-
tikvOfflineCounter.Set(0)
419-
tikvTombstoneCounter.Set(0)
420-
tikvLowSpaceCounter.Set(0)
421-
tikvPreparingCounter.Set(0)
422-
tikvServingCounter.Set(0)
423-
tikvRemovingCounter.Set(0)
424-
tikvRemovedCounter.Set(0)
425-
426-
// tiflash status counters.
427-
tiflashUpCounter.Set(0)
428-
tiflashDiconnectedCounter.Set(0)
429-
tiflashDownCounter.Set(0)
430-
tiflashUnhealthCounter.Set(0)
431-
tiflashOfflineCounter.Set(0)
432-
tiflashTombstoneCounter.Set(0)
433-
tiflashLowSpaceCounter.Set(0)
434-
tiflashPreparingCounter.Set(0)
435-
tiflashServingCounter.Set(0)
436-
tiflashRemovingCounter.Set(0)
437-
tiflashRemovedCounter.Set(0)
438-
439-
// Store status metrics.
440-
storeRegionCountGauge.Set(0)
441-
storeLeaderCountGauge.Set(0)
442-
storeWitnessCountGauge.Set(0)
443-
storeLearnerCountGauge.Set(0)
444-
storeStorageSizeGauge.Set(0)
445-
storeStorageCapacityGauge.Set(0)
446-
}

pkg/statistics/store_collection_test.go

Lines changed: 1 addition & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -76,22 +76,7 @@ func TestStoreStatistics(t *testing.T) {
7676
ObserveHotStat(store, storesStats)
7777
}
7878
stats := storeStats.stats
79-
tikvStats := stats.engineStatistics[core.EngineTiKV]
80-
81-
re.Equal(6, tikvStats.Up)
82-
re.Equal(7, tikvStats.Preparing)
83-
re.Equal(0, tikvStats.Serving)
84-
re.Equal(1, tikvStats.Removing)
85-
re.Equal(1, tikvStats.Removed)
86-
re.Equal(1, tikvStats.Down)
87-
re.Equal(1, tikvStats.Offline)
88-
re.Equal(0, stats.RegionCount)
89-
re.Equal(0, stats.WitnessCount)
90-
re.Equal(0, tikvStats.Unhealthy)
91-
re.Equal(0, tikvStats.Disconnect)
92-
re.Equal(1, tikvStats.Tombstone)
93-
re.Equal(1, tikvStats.LowSpace)
94-
re.Equal(1, stats.engineStatistics[core.EngineTiFlash].Up)
79+
9580
re.Len(stats.LabelCounter["zone:z1"], 2)
9681
re.Equal([]uint64{1, 2}, stats.LabelCounter["zone:z1"])
9782
re.Len(stats.LabelCounter["zone:z2"], 2)
@@ -100,7 +85,6 @@ func TestStoreStatistics(t *testing.T) {
10085
re.Equal([]uint64{1, 3, 5, 7}, stats.LabelCounter["host:h1"])
10186
re.Len(stats.LabelCounter["host:h2"], 4)
10287
re.Len(stats.LabelCounter["zone:unknown"], 2)
103-
re.Equal(0, stats.LeaderCount)
10488
}
10589

10690
func TestSummaryStoreInfos(t *testing.T) {

pkg/statistics/utils/labels.go

Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,22 @@
1+
// Copyright 2020 TiKV Project Authors.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package utils
16+
17+
import "github.com/prometheus/client_golang/prometheus"
18+
19+
// SingleLabel builds a labels map containing only a single label.
20+
func SingleLabel(key, value string) prometheus.Labels {
21+
return prometheus.Labels{key: value}
22+
}

0 commit comments

Comments
 (0)