Skip to content

Commit 9b720c7

Browse files
authored
Merge pull request #986 from weaveworks/cortex-monitoring
Add dashboard for the cortex ring, and alert on unhealthy ingesters
2 parents 833d056 + 94784ce commit 9b720c7

File tree

3 files changed

+231
-1
lines changed

3 files changed

+231
-1
lines changed

monitoring/grafana/cortex-ring.json

Lines changed: 219 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,219 @@
1+
{
2+
"annotations": {
3+
"list": []
4+
},
5+
"editable": true,
6+
"gnetId": null,
7+
"hideControls": false,
8+
"id": null,
9+
"links": [],
10+
"rows": [
11+
{
12+
"collapse": false,
13+
"editable": true,
14+
"height": "250px",
15+
"panels": [
16+
{
17+
"aliasColors": {},
18+
"bars": false,
19+
"datasource": "Scope-as-a-Service Prometheus",
20+
"editable": true,
21+
"error": false,
22+
"fill": 1,
23+
"grid": {
24+
"threshold1": null,
25+
"threshold1Color": "rgba(216, 200, 27, 0.27)",
26+
"threshold2": null,
27+
"threshold2Color": "rgba(234, 112, 112, 0.22)"
28+
},
29+
"id": 1,
30+
"isNew": true,
31+
"legend": {
32+
"avg": false,
33+
"current": false,
34+
"max": false,
35+
"min": false,
36+
"show": true,
37+
"total": false,
38+
"values": false
39+
},
40+
"lines": true,
41+
"linewidth": 2,
42+
"links": [],
43+
"nullPointMode": "connected",
44+
"percentage": false,
45+
"pointradius": 5,
46+
"points": false,
47+
"renderer": "flot",
48+
"seriesOverrides": [],
49+
"span": 6,
50+
"stack": true,
51+
"steppedLine": false,
52+
"targets": [
53+
{
54+
"expr": "max(cortex_ring_ingester_ownership_percent{job=\"cortex/distributor\"}) by (ingester)",
55+
"intervalFactor": 2,
56+
"legendFormat": "{{ingester}}",
57+
"refId": "A",
58+
"step": 60
59+
}
60+
],
61+
"timeFrom": null,
62+
"timeShift": null,
63+
"title": "Ingester Ring Ownership",
64+
"tooltip": {
65+
"msResolution": true,
66+
"shared": true,
67+
"sort": 0,
68+
"value_type": "cumulative"
69+
},
70+
"type": "graph",
71+
"xaxis": {
72+
"show": true
73+
},
74+
"yaxes": [
75+
{
76+
"format": "percentunit",
77+
"label": "",
78+
"logBase": 1,
79+
"max": null,
80+
"min": null,
81+
"show": true
82+
},
83+
{
84+
"format": "short",
85+
"label": null,
86+
"logBase": 1,
87+
"max": null,
88+
"min": null,
89+
"show": true
90+
}
91+
]
92+
},
93+
{
94+
"aliasColors": {},
95+
"bars": false,
96+
"datasource": "Scope-as-a-Service Prometheus",
97+
"editable": true,
98+
"error": false,
99+
"fill": 1,
100+
"grid": {
101+
"threshold1": null,
102+
"threshold1Color": "rgba(216, 200, 27, 0.27)",
103+
"threshold2": null,
104+
"threshold2Color": "rgba(234, 112, 112, 0.22)"
105+
},
106+
"id": 2,
107+
"isNew": true,
108+
"legend": {
109+
"avg": false,
110+
"current": false,
111+
"max": false,
112+
"min": false,
113+
"show": true,
114+
"total": false,
115+
"values": false
116+
},
117+
"lines": true,
118+
"linewidth": 2,
119+
"links": [],
120+
"nullPointMode": "connected",
121+
"percentage": false,
122+
"pointradius": 5,
123+
"points": false,
124+
"renderer": "flot",
125+
"seriesOverrides": [],
126+
"span": 6,
127+
"stack": false,
128+
"steppedLine": false,
129+
"targets": [
130+
{
131+
"expr": "cortex_ring_ingesters{job=\"cortex/distributor\"}",
132+
"intervalFactor": 2,
133+
"refId": "A",
134+
"step": 60
135+
}
136+
],
137+
"timeFrom": null,
138+
"timeShift": null,
139+
"title": "Ingesters In Ring",
140+
"tooltip": {
141+
"msResolution": true,
142+
"shared": true,
143+
"sort": 0,
144+
"value_type": "cumulative"
145+
},
146+
"type": "graph",
147+
"xaxis": {
148+
"show": true
149+
},
150+
"yaxes": [
151+
{
152+
"format": "none",
153+
"label": null,
154+
"logBase": 1,
155+
"max": null,
156+
"min": null,
157+
"show": true
158+
},
159+
{
160+
"format": "short",
161+
"label": null,
162+
"logBase": 1,
163+
"max": null,
164+
"min": null,
165+
"show": true
166+
}
167+
]
168+
}
169+
],
170+
"title": "Row"
171+
},
172+
{
173+
"collapse": false,
174+
"editable": true,
175+
"height": "250px",
176+
"panels": [],
177+
"title": "New row"
178+
}
179+
],
180+
"schemaVersion": 12,
181+
"sharedCrosshair": false,
182+
"style": "dark",
183+
"tags": [],
184+
"templating": {
185+
"list": []
186+
},
187+
"time": {
188+
"from": "now-6h",
189+
"to": "now"
190+
},
191+
"timepicker": {
192+
"refresh_intervals": [
193+
"5s",
194+
"10s",
195+
"30s",
196+
"1m",
197+
"5m",
198+
"15m",
199+
"30m",
200+
"1h",
201+
"2h",
202+
"1d"
203+
],
204+
"time_options": [
205+
"5m",
206+
"15m",
207+
"1h",
208+
"6h",
209+
"12h",
210+
"24h",
211+
"2d",
212+
"7d",
213+
"30d"
214+
]
215+
},
216+
"timezone": "utc",
217+
"title": "Cortex (ring stats)",
218+
"version": 0
219+
}

monitoring/grafana/cortex.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3245,6 +3245,6 @@
32453245
]
32463246
},
32473247
"timezone": "utc",
3248-
"title": "Cortex",
3248+
"title": "Cortex Services",
32493249
"version": 0
32503250
}

monitoring/prometheus/alert.rules

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,3 +192,14 @@ ALERT MarketingFail
192192
summary = "We're failing to push prospects to marketing.",
193193
description = "We're failing to push prospects to marketing.",
194194
}
195+
196+
# Cortex alerts
197+
198+
# Warning-severity alert: fires once the maximum (taken across instances, via
# `without(instance)`) of cortex_ring_ingesters{state="unhealthy"} from the
# cortex/distributor job has stayed above 0 for 15 minutes.
# {{ $value }} in the annotations is that maximum.
ALERT CortexIngesterUnhealthy
199+
IF max without(instance) (cortex_ring_ingesters{job="cortex/distributor", state="unhealthy"}) > 0
200+
FOR 15m
201+
LABELS { severity="warning" }
202+
ANNOTATIONS {
203+
summary = "Cortex has {{ $value }} unhealthy ingesters!",
204+
description = "Cortex has {{ $value }} unhealthy ingesters!",
205+
}

0 commit comments

Comments
 (0)