1
1
package ruler
2
2
3
3
import (
4
+ native_ctx "context"
5
+ "crypto/md5"
6
+ "encoding/json"
4
7
"flag"
5
8
"fmt"
6
9
"net/http"
@@ -12,11 +15,17 @@ import (
12
15
gklog "github.com/go-kit/kit/log"
13
16
"github.com/go-kit/kit/log/level"
14
17
"github.com/prometheus/client_golang/prometheus"
18
+ config_util "github.com/prometheus/common/config"
15
19
"github.com/prometheus/common/model"
16
20
"github.com/prometheus/prometheus/config"
21
+ "github.com/prometheus/prometheus/discovery"
22
+ sd_config "github.com/prometheus/prometheus/discovery/config"
23
+ "github.com/prometheus/prometheus/discovery/dns"
24
+ "github.com/prometheus/prometheus/discovery/targetgroup"
17
25
"github.com/prometheus/prometheus/notifier"
18
26
"github.com/prometheus/prometheus/promql"
19
27
"github.com/prometheus/prometheus/rules"
28
+ "github.com/prometheus/prometheus/util/strutil"
20
29
"golang.org/x/net/context"
21
30
"golang.org/x/net/context/ctxhttp"
22
31
@@ -97,7 +106,67 @@ type Ruler struct {
97
106
98
107
// Per-user notifiers with separate queues.
99
108
notifiersMtx sync.Mutex
100
- notifiers map [string ]* notifier.Notifier
109
+ notifiers map [string ]* rulerNotifier
110
+ }
111
+
112
+ type rulerNotifier struct {
113
+ notifier * notifier.Notifier
114
+ sdCtx context.Context
115
+ sdCancel context.CancelFunc
116
+ sdManager * discovery.Manager
117
+ wg sync.WaitGroup
118
+ logger gklog.Logger
119
+ }
120
+
121
+ func newRulerNotifier (o * notifier.Options , l gklog.Logger ) * rulerNotifier {
122
+ ctx , cancel := context .WithCancel (context .Background ())
123
+ return & rulerNotifier {
124
+ notifier : notifier .New (o , l ),
125
+ sdCtx : ctx ,
126
+ sdCancel : cancel ,
127
+ sdManager : discovery .NewManager (l ),
128
+ logger : l ,
129
+ }
130
+ }
131
+
132
+ func (rn * rulerNotifier ) run () {
133
+ rn .wg .Add (2 )
134
+ go func () {
135
+ if err := rn .sdManager .Run (rn .sdCtx ); err != nil {
136
+ level .Error (rn .logger ).Log ("msg" , "error starting notifier discovery manager" , "err" , err )
137
+ }
138
+ rn .wg .Done ()
139
+ }()
140
+ go func () {
141
+ rn .notifier .Run (rn .sdManager .SyncCh ())
142
+ rn .wg .Done ()
143
+ }()
144
+ }
145
+
146
+ func (rn * rulerNotifier ) applyConfig (cfg * config.Config ) error {
147
+ if err := rn .notifier .ApplyConfig (cfg ); err != nil {
148
+ return err
149
+ }
150
+
151
+ sdCfgs := make (map [string ]sd_config.ServiceDiscoveryConfig )
152
+ for _ , v := range cfg .AlertingConfig .AlertmanagerConfigs {
153
+ // AlertmanagerConfigs doesn't hold an unique identifier so we use the config hash as the identifier.
154
+ b , err := json .Marshal (v )
155
+ if err != nil {
156
+ return err
157
+ }
158
+ // This hash needs to be identical to the one computed in the notifier in
159
+ // https://github.com/prometheus/prometheus/blob/719c579f7b917b384c3d629752dea026513317dc/notifier/notifier.go#L265
160
+ // This kind of sucks, but it's done in Prometheus in main.go in the same way.
161
+ sdCfgs [fmt .Sprintf ("%x" , md5 .Sum (b ))] = v .ServiceDiscoveryConfig
162
+ }
163
+ return rn .sdManager .ApplyConfig (sdCfgs )
164
+ }
165
+
166
+ func (rn * rulerNotifier ) stop () {
167
+ rn .sdCancel ()
168
+ rn .notifier .Stop ()
169
+ rn .wg .Wait ()
101
170
}
102
171
103
172
// NewRuler creates a new ruler from a distributor and chunk store.
@@ -112,7 +181,7 @@ func NewRuler(cfg Config, d *distributor.Distributor, c *chunk.Store) (*Ruler, e
112
181
alertURL : cfg .ExternalURL .URL ,
113
182
notifierCfg : ncfg ,
114
183
queueCapacity : cfg .NotificationQueueCapacity ,
115
- notifiers : map [string ]* notifier. Notifier {},
184
+ notifiers : map [string ]* rulerNotifier {},
116
185
}, nil
117
186
}
118
187
@@ -124,23 +193,23 @@ func buildNotifierConfig(rulerConfig *Config) (*config.Config, error) {
124
193
}
125
194
126
195
u := rulerConfig .AlertmanagerURL
127
- var sdConfig config .ServiceDiscoveryConfig
196
+ var sdConfig sd_config .ServiceDiscoveryConfig
128
197
if rulerConfig .AlertmanagerDiscovery {
129
198
if ! strings .Contains (u .Host , "_tcp." ) {
130
199
return nil , fmt .Errorf ("When alertmanager-discovery is on, host name must be of the form _portname._tcp.service.fqdn (is %q)" , u .Host )
131
200
}
132
- dnsSDConfig := config. DNSSDConfig {
201
+ dnsSDConfig := dns. SDConfig {
133
202
Names : []string {u .Host },
134
203
RefreshInterval : model .Duration (rulerConfig .AlertmanagerRefreshInterval ),
135
204
Type : "SRV" ,
136
205
Port : 0 , // Ignored, because of SRV.
137
206
}
138
- sdConfig = config .ServiceDiscoveryConfig {
139
- DNSSDConfigs : []* config. DNSSDConfig {& dnsSDConfig },
207
+ sdConfig = sd_config .ServiceDiscoveryConfig {
208
+ DNSSDConfigs : []* dns. SDConfig {& dnsSDConfig },
140
209
}
141
210
} else {
142
- sdConfig = config .ServiceDiscoveryConfig {
143
- StaticConfigs : []* config. TargetGroup {
211
+ sdConfig = sd_config .ServiceDiscoveryConfig {
212
+ StaticConfigs : []* targetgroup. Group {
144
213
{
145
214
Targets : []model.LabelSet {
146
215
{
@@ -165,14 +234,14 @@ func buildNotifierConfig(rulerConfig *Config) (*config.Config, error) {
165
234
}
166
235
167
236
if u .User != nil {
168
- amConfig .HTTPClientConfig = config .HTTPClientConfig {
169
- BasicAuth : & config .BasicAuth {
237
+ amConfig .HTTPClientConfig = config_util .HTTPClientConfig {
238
+ BasicAuth : & config_util .BasicAuth {
170
239
Username : u .User .Username (),
171
240
},
172
241
}
173
242
174
243
if password , isSet := u .User .Password (); isSet {
175
- amConfig .HTTPClientConfig .BasicAuth .Password = config .Secret (password )
244
+ amConfig .HTTPClientConfig .BasicAuth .Password = config_util .Secret (password )
176
245
}
177
246
}
178
247
@@ -191,26 +260,59 @@ func (r *Ruler) newGroup(ctx context.Context, rs []rules.Rule) (*rules.Group, er
191
260
}
192
261
opts := & rules.ManagerOptions {
193
262
Appendable : appendable ,
194
- QueryEngine : r .engine ,
263
+ QueryFunc : rules . EngineQueryFunc ( r .engine ) ,
195
264
Context : ctx ,
196
265
ExternalURL : r .alertURL ,
197
- Notifier : notifier ,
266
+ NotifyFunc : sendAlerts ( notifier , r . alertURL . String ()) ,
198
267
Logger : gklog .NewNopLogger (),
268
+ Registerer : prometheus .DefaultRegisterer ,
199
269
}
200
270
delay := 0 * time .Second // Unused, so 0 value is fine.
201
271
return rules .NewGroup ("default" , "none" , delay , rs , opts ), nil
202
272
}
203
273
274
+ // sendAlerts implements a the rules.NotifyFunc for a Notifier.
275
+ // It filters any non-firing alerts from the input.
276
+ //
277
+ // Copied from Prometheus's main.go.
278
+ func sendAlerts (n * notifier.Notifier , externalURL string ) rules.NotifyFunc {
279
+ return func (ctx native_ctx.Context , expr string , alerts ... * rules.Alert ) error {
280
+ var res []* notifier.Alert
281
+
282
+ for _ , alert := range alerts {
283
+ // Only send actually firing alerts.
284
+ if alert .State == rules .StatePending {
285
+ continue
286
+ }
287
+ a := & notifier.Alert {
288
+ StartsAt : alert .FiredAt ,
289
+ Labels : alert .Labels ,
290
+ Annotations : alert .Annotations ,
291
+ GeneratorURL : externalURL + strutil .TableLinkForExpression (expr ),
292
+ }
293
+ if ! alert .ResolvedAt .IsZero () {
294
+ a .EndsAt = alert .ResolvedAt
295
+ }
296
+ res = append (res , a )
297
+ }
298
+
299
+ if len (alerts ) > 0 {
300
+ n .Send (res ... )
301
+ }
302
+ return nil
303
+ }
304
+ }
305
+
204
306
func (r * Ruler ) getOrCreateNotifier (userID string ) (* notifier.Notifier , error ) {
205
307
r .notifiersMtx .Lock ()
206
308
defer r .notifiersMtx .Unlock ()
207
309
208
310
n , ok := r .notifiers [userID ]
209
311
if ok {
210
- return n , nil
312
+ return n . notifier , nil
211
313
}
212
314
213
- n = notifier . New (& notifier.Options {
315
+ n = newRulerNotifier (& notifier.Options {
214
316
QueueCapacity : r .queueCapacity ,
215
317
Do : func (ctx context.Context , client * http.Client , req * http.Request ) (* http.Response , error ) {
216
318
// Note: The passed-in context comes from the Prometheus rule group code
@@ -222,17 +324,18 @@ func (r *Ruler) getOrCreateNotifier(userID string) (*notifier.Notifier, error) {
222
324
}
223
325
return ctxhttp .Do (ctx , client , req )
224
326
},
225
- }, gklog .NewNopLogger ())
327
+ }, util .Logger )
328
+
329
+ go n .run ()
226
330
227
331
// This should never fail, unless there's a programming mistake.
228
- if err := n .ApplyConfig (r .notifierCfg ); err != nil {
332
+ if err := n .applyConfig (r .notifierCfg ); err != nil {
229
333
return nil , err
230
334
}
231
- go n .Run ()
232
335
233
336
// TODO: Remove notifiers for stale users. Right now this is a slow leak.
234
337
r .notifiers [userID ] = n
235
- return n , nil
338
+ return n . notifier , nil
236
339
}
237
340
238
341
// Evaluate a list of rules in the given context.
@@ -245,7 +348,7 @@ func (r *Ruler) Evaluate(ctx context.Context, rs []rules.Rule) {
245
348
level .Error (logger ).Log ("msg" , "failed to create rule group" , "err" , err )
246
349
return
247
350
}
248
- g .Eval (start )
351
+ g .Eval (ctx , start )
249
352
250
353
// The prometheus routines we're calling have their own instrumentation
251
354
// but, a) it's rule-based, not group-based, b) it's a summary, not a
@@ -260,7 +363,7 @@ func (r *Ruler) Stop() {
260
363
defer r .notifiersMtx .Unlock ()
261
364
262
365
for _ , n := range r .notifiers {
263
- n .Stop ()
366
+ n .stop ()
264
367
}
265
368
}
266
369
0 commit comments