Skip to content

Commit f57209b

Browse files
committed
Add a timeout in ruler for each rule-group evaluation
1 parent 7da18ae commit f57209b

File tree

1 file changed

+11
-0
lines changed

1 file changed

+11
-0
lines changed

pkg/ruler/ruler.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,8 @@ type Config struct {
7272
NotificationQueueCapacity int
7373
// HTTP timeout duration when sending notifications to the Alertmanager.
7474
NotificationTimeout time.Duration
75+
// Timeout for rule group evaluation, including sending result to ingester
76+
GroupTimeout time.Duration
7577
}
7678

7779
// RegisterFlags adds the flags required to config this to the given FlagSet
@@ -85,6 +87,7 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
8587
f.DurationVar(&cfg.AlertmanagerRefreshInterval, "ruler.alertmanager-refresh-interval", 1*time.Minute, "How long to wait between refreshing alertmanager hosts.")
8688
f.IntVar(&cfg.NotificationQueueCapacity, "ruler.notification-queue-capacity", 10000, "Capacity of the queue for notifications to be sent to the Alertmanager.")
8789
f.DurationVar(&cfg.NotificationTimeout, "ruler.notification-timeout", 10*time.Second, "HTTP timeout duration when sending notifications to the Alertmanager.")
90+
f.DurationVar(&cfg.GroupTimeout, "ruler.group-timeout", 10*time.Second, "Timeout for rule group evaluation, including sending result to ingester")
8891
}
8992

9093
// Ruler evaluates rules.
@@ -94,6 +97,7 @@ type Ruler struct {
9497
alertURL *url.URL
9598
notifierCfg *config.Config
9699
queueCapacity int
100+
groupTimeout time.Duration
97101

98102
// Per-user notifiers with separate queues.
99103
notifiersMtx sync.Mutex
@@ -112,6 +116,7 @@ func NewRuler(cfg Config, d *distributor.Distributor, c *chunk.Store) (*Ruler, e
112116
alertURL: cfg.ExternalURL.URL,
113117
notifierCfg: ncfg,
114118
queueCapacity: cfg.NotificationQueueCapacity,
119+
groupTimeout: cfg.GroupTimeout,
115120
notifiers: map[string]*notifier.Notifier{},
116121
}, nil
117122
}
@@ -240,12 +245,18 @@ func (r *Ruler) Evaluate(ctx context.Context, rs []rules.Rule) {
240245
logger := util.WithContext(ctx, util.Logger)
241246
level.Debug(logger).Log("msg", "evaluating rules...", "num_rules", len(rs))
242247
start := time.Now()
248+
ctx, cancelTimeout := context.WithTimeout(ctx, r.groupTimeout)
243249
g, err := r.newGroup(ctx, rs)
244250
if err != nil {
245251
level.Error(logger).Log("msg", "failed to create rule group", "err", err)
246252
return
247253
}
248254
g.Eval(start)
255+
if err := ctx.Err(); err == nil {
256+
cancelTimeout() // release resources
257+
} else {
258+
level.Warn(util.Logger).Log("msg", "context error", "error", err)
259+
}
249260

250261
// The prometheus routines we're calling have their own instrumentation
251262
// but, a) it's rule-based, not group-based, b) it's a summary, not a

0 commit comments

Comments
 (0)