Skip to content

Commit 1990573

Browse files
committed
improve alert
Signed-off-by: Ryan Leung <rleungx@gmail.com>
1 parent 8bb1197 commit 1990573

File tree

15 files changed

+2011
-715
lines changed

15 files changed

+2011
-715
lines changed

Makefile

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,6 @@ SHELL := env PATH='$(PATH)' GOBIN='$(GO_TOOLS_BIN_PATH)' $(shell which bash)
180180

181181
install-tools:
182182
@mkdir -p $(GO_TOOLS_BIN_PATH)
183-
@which golangci-lint >/dev/null 2>&1 || curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(GO_TOOLS_BIN_PATH) v2.6.0
184183
@grep '_' tools.go | sed 's/"//g' | awk '{print $$2}' | xargs go install
185184

186185
.PHONY: install-tools
@@ -197,6 +196,9 @@ static: install-tools pre-build
197196
@ for mod in $(SUBMODULES); do cd $$mod && $(MAKE) static && cd $(ROOT_PATH) > /dev/null; done
198197
@ echo "leakcheck ..."
199198
@ leakcheck -exclude-files="tests/server/join/join_test.go" $(PACKAGES)
199+
@ echo "promtool ..."
200+
@ promtool check rules metrics/alertmanager/pd.rules.yml
201+
@ promtool test rules metrics/alertmanager/pd.rules.test.yml
200202

201203

202204
# Because CI downloads the dashboard code and runs gofmt, we can't add this check into static now.

cmd/pd-server/main.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,8 @@ func main() {
6565
addFlags(rootCmd)
6666
rootCmd.AddCommand(NewServiceCommand())
6767

68-
rootCmd.SetOutput(os.Stdout)
68+
rootCmd.SetOut(os.Stdout)
69+
rootCmd.SetErr(os.Stdout)
6970
if err := rootCmd.Execute(); err != nil {
7071
rootCmd.Println(err)
7172
exit(1)

go.mod

Lines changed: 377 additions & 74 deletions
Large diffs are not rendered by default.

go.sum

Lines changed: 995 additions & 158 deletions
Large diffs are not rendered by default.
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
rule_files:
2+
- pd.rules.yml
3+
4+
# Use 15s to match typical scrape interval.
5+
evaluation_interval: 15s
6+
7+
tests:
8+
- interval: 15s
9+
name: pd-leader-lease-drop-without-failover
10+
input_series:
11+
# PD leader metric (service_member_role) flaps on pd-1: 1 -> 0 -> 1.
12+
- series: 'service_member_role{job="pd",service="PD",instance="pd-1"}'
13+
# 20m total at a 15s step: 80 samples. service_member_role drops to 0 at minute 5 for 1m (4 samples), then recovers.
14+
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
15+
16+
# Embedded etcd leader stays stable on pd-1.
17+
- series: 'etcd_server_is_leader{job="pd",instance="pd-1"}'
18+
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
19+
20+
# No PD restarts in the window.
21+
- series: 'process_start_time_seconds{job="pd",instance="pd-1"}'
22+
values: '100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100'
23+
24+
alert_rule_test:
25+
- eval_time: 12m
26+
alertname: PD_leader_lease_drop_without_failover
27+
exp_alerts:
28+
- exp_labels:
29+
env: ENV_LABELS_ENV
30+
level: warning
31+
job: pd
32+
service: PD
33+
instance: pd-1
34+
expr: '(changes(service_member_role{job="pd",service="PD"}[10m]) >= 2) and (min_over_time(service_member_role{job="pd",service="PD"}[10m]) == 0) and (service_member_role{job="pd",service="PD"} == 1) and on() (sum(changes(etcd_server_is_leader{job="pd"}[10m])) == 0) and on() (sum(changes(process_start_time_seconds{job="pd"}[10m]) * on(instance,job) etcd_server_is_leader{job="pd"}) == 0)'
35+
exp_annotations:
36+
summary: 'PD leader lease dropped without failover'
37+
description: 'cluster: ENV_LABELS_ENV, instance: pd-1, PD leader lease dropped and recovered on the same node, while embedded etcd leader stayed stable; values:2'
38+
value: '2'
39+
40+
- interval: 15s
41+
name: pd-leader-lease-drop-suppressed-by-leader-restart
42+
input_series:
43+
- series: 'service_member_role{job="pd",service="PD",instance="pd-1"}'
44+
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
45+
46+
- series: 'etcd_server_is_leader{job="pd",instance="pd-1"}'
47+
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
48+
49+
# pd-1 (the etcd leader) restarted within the rule's 10m lookback window (process_start_time_seconds changes).
50+
- series: 'process_start_time_seconds{job="pd",instance="pd-1"}'
51+
# Change at minute 9.
52+
values: '100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500'
53+
54+
alert_rule_test:
55+
- eval_time: 12m
56+
alertname: PD_leader_lease_drop_without_failover
57+
exp_alerts: []
58+
59+
- interval: 15s
60+
name: pd-leader-lease-drop-not-suppressed-by-follower-restart
61+
input_series:
62+
- series: 'service_member_role{job="pd",service="PD",instance="pd-1"}'
63+
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
64+
65+
- series: 'etcd_server_is_leader{job="pd",instance="pd-1"}'
66+
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
67+
68+
- series: 'etcd_server_is_leader{job="pd",instance="pd-2"}'
69+
values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'
70+
71+
- series: 'process_start_time_seconds{job="pd",instance="pd-1"}'
72+
values: '100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100'
73+
74+
# A follower (pd-2) restart should NOT suppress the alert while the embedded etcd leader (pd-1) stays stable.
75+
- series: 'process_start_time_seconds{job="pd",instance="pd-2"}'
76+
# Change at minute 10.
77+
values: '200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600'
78+
79+
alert_rule_test:
80+
- eval_time: 12m
81+
alertname: PD_leader_lease_drop_without_failover
82+
exp_alerts:
83+
- exp_labels:
84+
env: ENV_LABELS_ENV
85+
level: warning
86+
job: pd
87+
service: PD
88+
instance: pd-1
89+
expr: '(changes(service_member_role{job="pd",service="PD"}[10m]) >= 2) and (min_over_time(service_member_role{job="pd",service="PD"}[10m]) == 0) and (service_member_role{job="pd",service="PD"} == 1) and on() (sum(changes(etcd_server_is_leader{job="pd"}[10m])) == 0) and on() (sum(changes(process_start_time_seconds{job="pd"}[10m]) * on(instance,job) etcd_server_is_leader{job="pd"}) == 0)'
90+
exp_annotations:
91+
summary: 'PD leader lease dropped without failover'
92+
description: 'cluster: ENV_LABELS_ENV, instance: pd-1, PD leader lease dropped and recovered on the same node, while embedded etcd leader stayed stable; values:2'
93+
value: '2'
94+
95+
- interval: 15s
96+
name: pd-leader-lease-drop-suppressed-by-etcd-leader-change
97+
input_series:
98+
- series: 'service_member_role{job="pd",service="PD",instance="pd-1"}'
99+
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
100+
101+
# Embedded etcd leader changes from pd-1 to pd-2 at minute 6.
102+
- series: 'etcd_server_is_leader{job="pd",instance="pd-1"}'
103+
# Minutes 0-6: 1 (24 samples); afterwards: 0 (leadership moves to pd-2).
104+
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'
105+
- series: 'etcd_server_is_leader{job="pd",instance="pd-2"}'
106+
values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
107+
108+
- series: 'process_start_time_seconds{job="pd",instance="pd-1"}'
109+
values: '100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100'
110+
- series: 'process_start_time_seconds{job="pd",instance="pd-2"}'
111+
values: '200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200'
112+
113+
alert_rule_test:
114+
- eval_time: 12m
115+
alertname: PD_leader_lease_drop_without_failover
116+
exp_alerts: []

metrics/alertmanager/pd.rules.yml

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,12 +130,29 @@ groups:
130130
labels:
131131
env: ENV_LABELS_ENV
132132
level: warning
133-
expr: count( changes(pd_tso_events{type="save"}[10m]) > 0 ) >= 2
133+
expr: count( changes(pd_tso_events{type="save"}[10m]) > 0 ) >= 2
134134
annotations:
135135
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
136136
value: '{{ $value }}'
137137
summary: PD_leader_change
138138

139+
- alert: PD_leader_lease_drop_without_failover
140+
expr: |
141+
(changes(service_member_role{job="pd",service="PD"}[10m]) >= 2)
142+
and (min_over_time(service_member_role{job="pd",service="PD"}[10m]) == 0)
143+
and (service_member_role{job="pd",service="PD"} == 1)
144+
and on() (sum(changes(etcd_server_is_leader{job="pd"}[10m])) == 0)
145+
and on() (sum(changes(process_start_time_seconds{job="pd"}[10m]) * on(instance,job) etcd_server_is_leader{job="pd"}) == 0)
146+
for: 1m
147+
labels:
148+
env: ENV_LABELS_ENV
149+
level: warning
150+
expr: (changes(service_member_role{job="pd",service="PD"}[10m]) >= 2) and (min_over_time(service_member_role{job="pd",service="PD"}[10m]) == 0) and (service_member_role{job="pd",service="PD"} == 1) and on() (sum(changes(etcd_server_is_leader{job="pd"}[10m])) == 0) and on() (sum(changes(process_start_time_seconds{job="pd"}[10m]) * on(instance,job) etcd_server_is_leader{job="pd"}) == 0)
151+
annotations:
152+
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, PD leader lease dropped and recovered on the same node, while embedded etcd leader stayed stable; values:{{ $value }}'
153+
value: '{{ $value }}'
154+
summary: PD leader lease dropped without failover
155+
139156
- alert: PD_cluster_store_space_used_more_than_80%
140157
expr: sum(pd_cluster_status{type="storage_size"}) / sum(pd_cluster_status{type="storage_capacity"}) * 100 > 80
141158
for: 1m

0 commit comments

Comments
 (0)