Skip to content

Commit f93ace9

Browse files
committed
Make metrics query expressions configurable
1 parent dc1a5ca commit f93ace9

File tree

1 file changed

+21
-11
lines changed

1 file changed

+21
-11
lines changed

pkg/chunk/aws/metrics_autoscaling.go

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -24,20 +24,35 @@ const (
2424
targetMax = 10 // always scale up if queue bigger than this times target
2525
errorFractionScaledown = 0.1
2626
minUsageForScaledown = 100 // only scale down if usage is > this DynamoDB units/sec
27+
28+
// fetch Ingester queue length
29+
// average the queue length over 2 minutes to avoid aliasing with the 1-minute flush period;
// if the flush period is configured differently, override this via -metrics.queue-length-query
30+
defaultQueueLenQuery = `sum(avg_over_time(cortex_ingester_flush_queue_length{job="cortex/ingester"}[2m]))`
31+
// fetch write error rate per DynamoDB table
32+
defaultErrorRateQuery = `sum(rate(cortex_dynamo_failures_total{error="ProvisionedThroughputExceededException",operation=~".*Write.*"}[1m])) by (table) > 0`
33+
// fetch write capacity usage per DynamoDB table
34+
// use the rate over 15 minutes so we take a broad average
35+
defaultUsageQuery = `sum(rate(cortex_dynamo_consumed_capacity_total{operation="DynamoDB.BatchWriteItem"}[15m])) by (table) > 0`
2736
)
2837

2938
// MetricsAutoScalingConfig holds the parameters for metrics-based autoscaling:
// the Prometheus URL to query, the scale-up thresholds, and the PromQL expressions to run.
3039
type MetricsAutoScalingConfig struct {
31-
URL string // URL to contact Prometheus store on
32-
TargetQueueLen int64 // Queue length above which we will scale up capacity
33-
ScaleUpFactor float64 // Scale up capacity by this multiple
40+
URL string // URL to contact Prometheus store on
41+
TargetQueueLen int64 // Queue length above which we will scale up capacity
42+
ScaleUpFactor float64 // Scale up capacity by this multiple
43+
QueueLengthQuery string // PromQL query to fetch ingester queue length
44+
ErrorRateQuery string // PromQL query to fetch error rates per table
45+
UsageQuery string // PromQL query to fetch write capacity usage per table
3446
}
3547

3648
// RegisterFlags registers the metrics-autoscaling flags on f, binding each one to a field of cfg.
// The three query flags default to defaultQueueLenQuery, defaultErrorRateQuery and defaultUsageQuery.
3749
func (cfg *MetricsAutoScalingConfig) RegisterFlags(f *flag.FlagSet) {
3850
f.StringVar(&cfg.URL, "metrics.url", "", "Use metrics-based autoscaling, via this query URL")
3951
f.Int64Var(&cfg.TargetQueueLen, "metrics.target-queue-length", 100000, "Queue length above which we will scale up capacity")
4052
f.Float64Var(&cfg.ScaleUpFactor, "metrics.scale-up-factor", 1.3, "Scale up capacity by this multiple")
53+
f.StringVar(&cfg.QueueLengthQuery, "metrics.queue-length-query", defaultQueueLenQuery, "query to fetch ingester queue length")
54+
f.StringVar(&cfg.ErrorRateQuery, "metrics.error-rate-query", defaultErrorRateQuery, "query to fetch error rates per table")
55+
f.StringVar(&cfg.UsageQuery, "metrics.usage-query", defaultUsageQuery, "query to fetch write capacity usage per table")
4156
}
4257

4358
type metricsData struct {
@@ -176,9 +191,7 @@ func (m *metricsData) update(ctx context.Context) error {
176191
}
177192

178193
m.promLastQuery = mtime.Now()
179-
// average the queue length over 2 minutes to avoid aliasing with the 1-minute flush period
180-
// TODO: adjust that 2m depending on configuration of the flush period
181-
qlMatrix, err := promQuery(ctx, m.promAPI, `sum(avg_over_time(cortex_ingester_flush_queue_length{job="cortex/ingester"}[2m]))`, queueObservationPeriod, queueObservationPeriod/2)
194+
qlMatrix, err := promQuery(ctx, m.promAPI, m.cfg.QueueLengthQuery, queueObservationPeriod, queueObservationPeriod/2)
182195
if err != nil {
183196
return err
184197
}
@@ -193,18 +206,15 @@ func (m *metricsData) update(ctx context.Context) error {
193206
m.queueLengths[i] = float64(v.Value)
194207
}
195208

196-
// fetch write error rate per DynamoDB table
197-
deMatrix, err := promQuery(ctx, m.promAPI, `sum(rate(cortex_dynamo_failures_total{error="ProvisionedThroughputExceededException",operation=~".*Write.*"}[1m])) by (table) > 0`, 0, time.Second)
209+
deMatrix, err := promQuery(ctx, m.promAPI, m.cfg.ErrorRateQuery, 0, time.Second)
198210
if err != nil {
199211
return err
200212
}
201213
if m.errorRates, err = extractRates(deMatrix); err != nil {
202214
return err
203215
}
204216

205-
// fetch write capacity usage per DynamoDB table
206-
// use the rate over 15 minutes so we take a broad average
207-
usageMatrix, err := promQuery(ctx, m.promAPI, `sum(rate(cortex_dynamo_consumed_capacity_total{operation="DynamoDB.BatchWriteItem"}[15m])) by (table) > 0`, 0, time.Second)
217+
usageMatrix, err := promQuery(ctx, m.promAPI, m.cfg.UsageQuery, 0, time.Second)
208218
if err != nil {
209219
return err
210220
}

0 commit comments

Comments
 (0)