Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,6 @@ SHELL := env PATH='$(PATH)' GOBIN='$(GO_TOOLS_BIN_PATH)' $(shell which bash)

# install-tools: prepare the Go tool binaries under $(GO_TOOLS_BIN_PATH).
# The `static` target lists this target as a prerequisite.
# NOTE(review): the hunk header (-180,7 +180,6) shows one line of this context is
# removed by the change; presumably the golangci-lint download line — confirm against the raw diff.
install-tools:
# Create the tools directory if it does not exist yet.
@mkdir -p $(GO_TOOLS_BIN_PATH)
# Only when golangci-lint is not found on PATH: download its install script and
# install v2.6.0 into the tools directory.
@which golangci-lint >/dev/null 2>&1 || curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(GO_TOOLS_BIN_PATH) v2.6.0
# Take every line of tools.go that contains '_', strip the double quotes, and hand
# the second field of each line to `go install`.
@grep '_' tools.go | sed 's/"//g' | awk '{print $$2}' | xargs go install

.PHONY: install-tools
Expand All @@ -197,6 +196,9 @@ static: install-tools pre-build
@ for mod in $(SUBMODULES); do cd $$mod && $(MAKE) static && cd $(ROOT_PATH) > /dev/null; done
@ echo "leakcheck ..."
@ leakcheck -exclude-files="tests/server/join/join_test.go" $(PACKAGES)
@ echo "promtool ..."
@ promtool check rules metrics/alertmanager/pd.rules.yml
@ promtool test rules metrics/alertmanager/pd.rules.test.yml


# Because CI downloads the dashboard code and runs gofmt, we can't add this check into static now.
Expand Down
3 changes: 2 additions & 1 deletion cmd/pd-server/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,8 @@ func main() {
addFlags(rootCmd)
rootCmd.AddCommand(NewServiceCommand())

rootCmd.SetOutput(os.Stdout)
rootCmd.SetOut(os.Stdout)
rootCmd.SetErr(os.Stdout)
if err := rootCmd.Execute(); err != nil {
rootCmd.Println(err)
exit(1)
Expand Down
451 changes: 377 additions & 74 deletions go.mod

Large diffs are not rendered by default.

1,153 changes: 995 additions & 158 deletions go.sum

Large diffs are not rendered by default.

116 changes: 116 additions & 0 deletions metrics/alertmanager/pd.rules.test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
# Rule unit tests for pd.rules.yml, run with `promtool test rules`.
rule_files:
- pd.rules.yml

# Evaluate the rules every 15s to match the typical scrape interval; the input
# series of every test case below use the same 15s step.
evaluation_interval: 15s

# Every case checks the PD_leader_lease_drop_without_failover alert at eval_time 12m.
tests:
- interval: 15s
name: pd-leader-lease-drop-without-failover
input_series:
# PD leader metric (service_member_role) flaps on pd-1: 1 -> 0 -> 1.
- series: 'service_member_role{job="pd",service="PD",instance="pd-1"}'
# 20m total at 15s step: 80 samples. Drop at minute 5 for 1m.
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'

# Embedded etcd leader stays stable on pd-1.
- series: 'etcd_server_is_leader{job="pd",instance="pd-1"}'
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'

# No PD restarts in the window.
- series: 'process_start_time_seconds{job="pd",instance="pd-1"}'
values: '100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100'

alert_rule_test:
- eval_time: 12m
alertname: PD_leader_lease_drop_without_failover
exp_alerts:
- exp_labels:
env: ENV_LABELS_ENV
level: warning
job: pd
service: PD
instance: pd-1
expr: '(changes(service_member_role{job="pd",service="PD"}[10m]) >= 2) and (min_over_time(service_member_role{job="pd",service="PD"}[10m]) == 0) and (service_member_role{job="pd",service="PD"} == 1) and on() (sum(changes(etcd_server_is_leader{job="pd"}[10m])) == 0) and on() (sum(changes(process_start_time_seconds{job="pd"}[10m]) * on(instance,job) etcd_server_is_leader{job="pd"}) == 0)'
exp_annotations:
summary: 'PD leader lease dropped without failover'
description: 'cluster: ENV_LABELS_ENV, instance: pd-1, PD leader lease dropped and recovered on the same node, while embedded etcd leader stayed stable; values:2'
value: '2'

# Negative case: same role flap on pd-1 as in the positive case, but pd-1 (which
# reports etcd_server_is_leader = 1) also restarted, so the alert must stay silent.
- interval: 15s
name: pd-leader-lease-drop-suppressed-by-leader-restart
input_series:
# Role drops to 0 at minute 5 for 1m, then recovers.
- series: 'service_member_role{job="pd",service="PD",instance="pd-1"}'
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'

# Embedded etcd leader stays on pd-1 for the whole run.
- series: 'etcd_server_is_leader{job="pd",instance="pd-1"}'
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'

# pd-1 restarted within the rule's 10m lookback window (start_time changes).
- series: 'process_start_time_seconds{job="pd",instance="pd-1"}'
# Change at minute 9, i.e. inside the minutes 2-12 window evaluated at 12m.
values: '100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500'

alert_rule_test:
# The restart makes the process_start_time_seconds term of the rule non-zero,
# so no alert is expected.
- eval_time: 12m
alertname: PD_leader_lease_drop_without_failover
exp_alerts: []

- interval: 15s
name: pd-leader-lease-drop-not-suppressed-by-follower-restart
input_series:
- series: 'service_member_role{job="pd",service="PD",instance="pd-1"}'
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'

- series: 'etcd_server_is_leader{job="pd",instance="pd-1"}'
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'

- series: 'etcd_server_is_leader{job="pd",instance="pd-2"}'
values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'

- series: 'process_start_time_seconds{job="pd",instance="pd-1"}'
values: '100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100'

# Follower restart should NOT suppress when embedded etcd leader is stable.
- series: 'process_start_time_seconds{job="pd",instance="pd-2"}'
# Change at minute 10.
values: '200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600'

alert_rule_test:
- eval_time: 12m
alertname: PD_leader_lease_drop_without_failover
exp_alerts:
- exp_labels:
env: ENV_LABELS_ENV
level: warning
job: pd
service: PD
instance: pd-1
expr: '(changes(service_member_role{job="pd",service="PD"}[10m]) >= 2) and (min_over_time(service_member_role{job="pd",service="PD"}[10m]) == 0) and (service_member_role{job="pd",service="PD"} == 1) and on() (sum(changes(etcd_server_is_leader{job="pd"}[10m])) == 0) and on() (sum(changes(process_start_time_seconds{job="pd"}[10m]) * on(instance,job) etcd_server_is_leader{job="pd"}) == 0)'
exp_annotations:
summary: 'PD leader lease dropped without failover'
description: 'cluster: ENV_LABELS_ENV, instance: pd-1, PD leader lease dropped and recovered on the same node, while embedded etcd leader stayed stable; values:2'
value: '2'

- interval: 15s
name: pd-leader-lease-drop-suppressed-by-etcd-leader-change
input_series:
- series: 'service_member_role{job="pd",service="PD",instance="pd-1"}'
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'

# Embedded etcd leader changes from pd-1 to pd-2 at minute 6.
- series: 'etcd_server_is_leader{job="pd",instance="pd-1"}'
# minute 0-6: 1 (24 samples), after: 0
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'
- series: 'etcd_server_is_leader{job="pd",instance="pd-2"}'
values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'

- series: 'process_start_time_seconds{job="pd",instance="pd-1"}'
values: '100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100'
- series: 'process_start_time_seconds{job="pd",instance="pd-2"}'
values: '200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200'

alert_rule_test:
- eval_time: 12m
alertname: PD_leader_lease_drop_without_failover
exp_alerts: []
19 changes: 18 additions & 1 deletion metrics/alertmanager/pd.rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -130,12 +130,29 @@ groups:
labels:
env: ENV_LABELS_ENV
level: warning
expr: count( changes(pd_tso_events{type="save"}[10m]) > 0 ) >= 2
expr: count( changes(pd_tso_events{type="save"}[10m]) > 0 ) >= 2
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
value: '{{ $value }}'
summary: PD_leader_change

# Fires for an instance whose service_member_role dropped to 0 and came back to 1
# within the last 10m while nothing else explains the flap. All of these must hold:
#   1. service_member_role changed at least twice in the last 10m;
#   2. its minimum over the last 10m is 0;
#   3. it is 1 right now;
#   4. the summed changes of etcd_server_is_leader over 10m are 0 (across all job="pd" series);
#   5. the summed product of process_start_time_seconds changes and
#      etcd_server_is_leader, matched per instance/job, is 0. Presumably this means
#      "the current etcd leader did not restart", since a follower contributes 0.
# Terms 4 and 5 are joined with `and on()`, so they are cluster-wide gates rather
# than per-instance matches.
# NOTE(review): if etcd_server_is_leader or process_start_time_seconds is absent, the
# sums return no sample and the alert cannot fire — confirm this is acceptable.
# NOTE(review): changes() only sees scraped samples, so a drop and recovery that both
# happen between two scrapes go undetected (raised in the review comment below).
- alert: PD_leader_lease_drop_without_failover
expr: |
(changes(service_member_role{job="pd",service="PD"}[10m]) >= 2)
and (min_over_time(service_member_role{job="pd",service="PD"}[10m]) == 0)
and (service_member_role{job="pd",service="PD"} == 1)
and on() (sum(changes(etcd_server_is_leader{job="pd"}[10m])) == 0)
and on() (sum(changes(process_start_time_seconds{job="pd"}[10m]) * on(instance,job) etcd_server_is_leader{job="pd"}) == 0)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doesn't seem to detect two leader transitions within the same sampling interval. Are we fine with it?

for: 1m
labels:
env: ENV_LABELS_ENV
level: warning
# Single-line copy of the rule expression, attached as a label (other rules in this file do the same).
expr: (changes(service_member_role{job="pd",service="PD"}[10m]) >= 2) and (min_over_time(service_member_role{job="pd",service="PD"}[10m]) == 0) and (service_member_role{job="pd",service="PD"} == 1) and on() (sum(changes(etcd_server_is_leader{job="pd"}[10m])) == 0) and on() (sum(changes(process_start_time_seconds{job="pd"}[10m]) * on(instance,job) etcd_server_is_leader{job="pd"}) == 0)
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, PD leader lease dropped and recovered on the same node, while embedded etcd leader stayed stable; values:{{ $value }}'
value: '{{ $value }}'
summary: PD leader lease dropped without failover

- alert: PD_cluster_store_space_used_more_than_80%
expr: sum(pd_cluster_status{type="storage_size"}) / sum(pd_cluster_status{type="storage_capacity"}) * 100 > 80
for: 1m
Expand Down
Loading
Loading