|
| 1 | +rule_files: |
| 2 | + - pd.rules.yml |
| 3 | + |
| 4 | +# Use 15s to match typical scrape interval. |
| 5 | +evaluation_interval: 15s |
| 6 | + |
| 7 | +tests: |
| 8 | + - interval: 15s |
| 9 | + name: pd-leader-lease-drop-without-failover |
| 10 | + input_series: |
| 11 | + # PD leader metric (service_member_role) flaps on pd-1: 1 -> 0 -> 1. |
| 12 | + - series: 'service_member_role{job="pd",service="PD",instance="pd-1"}' |
| 13 | + # 20m total at 15s step: 80 samples. Drop at minute 5 for 1m. |
| 14 | + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' |
| 15 | + |
| 16 | + # Embedded etcd leader stays stable on pd-1. |
| 17 | + - series: 'etcd_server_is_leader{job="pd",instance="pd-1"}' |
| 18 | + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' |
| 19 | + |
| 20 | + # No PD restarts in the window. |
| 21 | + - series: 'process_start_time_seconds{job="pd",instance="pd-1"}' |
| 22 | + values: '100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100' |
| 23 | + |
| 24 | + alert_rule_test: |
| 25 | + - eval_time: 12m |
| 26 | + alertname: PD_leader_lease_drop_without_failover |
| 27 | + exp_alerts: |
| 28 | + - exp_labels: |
| 29 | + env: ENV_LABELS_ENV |
| 30 | + level: warning |
| 31 | + job: pd |
| 32 | + service: PD |
| 33 | + instance: pd-1 |
| 34 | + expr: '(changes(service_member_role{job="pd",service="PD"}[10m]) >= 2) and (min_over_time(service_member_role{job="pd",service="PD"}[10m]) == 0) and (service_member_role{job="pd",service="PD"} == 1) and on() (sum(changes(etcd_server_is_leader{job="pd"}[10m])) == 0) and on() (sum(changes(process_start_time_seconds{job="pd"}[10m]) * on(instance,job) etcd_server_is_leader{job="pd"}) == 0)' |
| 35 | + exp_annotations: |
| 36 | + summary: 'PD leader lease dropped without failover' |
| 37 | + description: 'cluster: ENV_LABELS_ENV, instance: pd-1, PD leader lease dropped and recovered on the same node, while embedded etcd leader stayed stable; values:2' |
| 38 | + value: '2' |
| 39 | + |
| 40 | + - interval: 15s |
| 41 | + name: pd-leader-lease-drop-suppressed-by-leader-restart |
| 42 | + input_series: |
| 43 | + - series: 'service_member_role{job="pd",service="PD",instance="pd-1"}' |
| 44 | + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' |
| 45 | + |
| 46 | + - series: 'etcd_server_is_leader{job="pd",instance="pd-1"}' |
| 47 | + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' |
| 48 | + |
| 49 | + # pd-1 (the etcd leader) restarts inside the rule's 10m lookback window (start_time changes). |
| 50 | + - series: 'process_start_time_seconds{job="pd",instance="pd-1"}' |
| 51 | + # Change at minute 9. |
| 52 | + values: '100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500' |
| 53 | + |
| 54 | + alert_rule_test: |
| 55 | + - eval_time: 12m |
| 56 | + alertname: PD_leader_lease_drop_without_failover |
| 57 | + exp_alerts: [] |
| 58 | + |
| 59 | + - interval: 15s |
| 60 | + name: pd-leader-lease-drop-not-suppressed-by-follower-restart |
| 61 | + input_series: |
| 62 | + - series: 'service_member_role{job="pd",service="PD",instance="pd-1"}' |
| 63 | + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' |
| 64 | + |
| 65 | + - series: 'etcd_server_is_leader{job="pd",instance="pd-1"}' |
| 66 | + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' |
| 67 | + |
| 68 | + - series: 'etcd_server_is_leader{job="pd",instance="pd-2"}' |
| 69 | + values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0' |
| 70 | + |
| 71 | + - series: 'process_start_time_seconds{job="pd",instance="pd-1"}' |
| 72 | + values: '100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100' |
| 73 | + |
| 74 | + # Follower restart should NOT suppress when embedded etcd leader is stable. |
| 75 | + - series: 'process_start_time_seconds{job="pd",instance="pd-2"}' |
| 76 | + # Change at minute 10. |
| 77 | + values: '200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600 600' |
| 78 | + |
| 79 | + alert_rule_test: |
| 80 | + - eval_time: 12m |
| 81 | + alertname: PD_leader_lease_drop_without_failover |
| 82 | + exp_alerts: |
| 83 | + - exp_labels: |
| 84 | + env: ENV_LABELS_ENV |
| 85 | + level: warning |
| 86 | + job: pd |
| 87 | + service: PD |
| 88 | + instance: pd-1 |
| 89 | + expr: '(changes(service_member_role{job="pd",service="PD"}[10m]) >= 2) and (min_over_time(service_member_role{job="pd",service="PD"}[10m]) == 0) and (service_member_role{job="pd",service="PD"} == 1) and on() (sum(changes(etcd_server_is_leader{job="pd"}[10m])) == 0) and on() (sum(changes(process_start_time_seconds{job="pd"}[10m]) * on(instance,job) etcd_server_is_leader{job="pd"}) == 0)' |
| 90 | + exp_annotations: |
| 91 | + summary: 'PD leader lease dropped without failover' |
| 92 | + description: 'cluster: ENV_LABELS_ENV, instance: pd-1, PD leader lease dropped and recovered on the same node, while embedded etcd leader stayed stable; values:2' |
| 93 | + value: '2' |
| 94 | + |
| 95 | + - interval: 15s |
| 96 | + name: pd-leader-lease-drop-suppressed-by-etcd-leader-change |
| 97 | + input_series: |
| 98 | + - series: 'service_member_role{job="pd",service="PD",instance="pd-1"}' |
| 99 | + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' |
| 100 | + |
| 101 | + # Embedded etcd leader changes from pd-1 to pd-2 at minute 6. |
| 102 | + - series: 'etcd_server_is_leader{job="pd",instance="pd-1"}' |
| 103 | + # First 24 samples (0m to 5m45s) are 1; from minute 6 onward the value is 0. |
| 104 | + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0' |
| 105 | + - series: 'etcd_server_is_leader{job="pd",instance="pd-2"}' |
| 106 | + values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' |
| 107 | + |
| 108 | + - series: 'process_start_time_seconds{job="pd",instance="pd-1"}' |
| 109 | + values: '100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100' |
| 110 | + - series: 'process_start_time_seconds{job="pd",instance="pd-2"}' |
| 111 | + values: '200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200' |
| 112 | + |
| 113 | + alert_rule_test: |
| 114 | + - eval_time: 12m |
| 115 | + alertname: PD_leader_lease_drop_without_failover |
| 116 | + exp_alerts: [] |
0 commit comments