Skip to content

Commit aa717c3

Browse files
authored
Add metric to track pre-validation native histogram bucket count per user (#7569)
Signed-off-by: Paurush Garg <paurushg@amazon.com>
1 parent a4e2f2f commit aa717c3

5 files changed

Lines changed: 57 additions & 7 deletions

File tree

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
* [ENHANCEMENT] Instrument Ingester CPU profile with source for read APIs. #7494
2929
* [ENHANCEMENT] Ingester: Convert expanded postings cache from FIFO to LRU eviction to retain frequently-queried entries under memory pressure. #7510
3030
* [ENHANCEMENT] Querier: Detach series label and chunk data from gRPC unmarshal buffers in store-gateway streaming path, allowing the Go GC to reclaim receive buffers. #7519
31-
31+
* [ENHANCEMENT] Distributor: Added `cortex_distributor_received_histogram_buckets` metric to track number of buckets in received native histogram samples before validation, per user. #7569
3232
* [ENHANCEMENT] Distributor: Add `WrappedHistogram` with configurable size limit (`-validation.max-native-histogram-size-bytes`) to cap native histogram protobuf size before unmarshalling. #7570
3333
* [ENHANCEMENT] Ingester: Add lazy regex evaluation on head postings cache miss. Defers expensive regex matchers on high-cardinality labels to per-series filtering when a selective equality matcher already narrows the result set. Configured via `-blocks-storage.expanded_postings_cache.head.lazy-matcher-max-cardinality` (disabled by default). #7553
3434
* [BUGFIX] Querier: Fix queryWithRetry and labelsWithRetry returning (nil, nil) on cancelled context by propagating ctx.Err(). #7370
@@ -47,6 +47,7 @@
4747
* [BUGFIX] Ingester: Release the TSDB appender on every early-return path in `Push` (e.g. out-of-order label set) by deferring `Rollback`. Previously such requests leaked TSDB head series references, mmap'd chunks and pending state per request, causing the `cortex_ingester_tsdb_head_active_appenders` gauge to grow unbounded. #7528
4848
* [BUGFIX] Ring: Fix ring token conflict resolution only applied to updated instance and make constantly token conflict check during instance observe period.
4949
* [BUGFIX] Distributor: Fix a panic (`slice bounds out of range`) in the stream push path when the context deadline expires while the worker goroutine is still marshalling a `WriteRequest`. #7541
50+
* [BUGFIX] Query Frontend: Fix native histogram responses not being handled correctly in `minTime()` sort ordering for split_by_interval merge. #7555
5051

5152
## 1.21.0 2026-04-24
5253

pkg/distributor/distributor.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,7 @@ type Distributor struct {
124124
incomingMetadata *prometheus.CounterVec
125125
nonHASamples *prometheus.CounterVec
126126
dedupedSamples *prometheus.CounterVec
127+
receivedHistogramBuckets *prometheus.HistogramVec
127128
labelsHistogram prometheus.Histogram
128129
ingesterAppends *prometheus.CounterVec
129130
ingesterAppendFailures *prometheus.CounterVec
@@ -353,6 +354,15 @@ func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Ove
353354
Name: "distributor_received_metadata_total",
354355
Help: "The total number of received metadata, excluding rejected.",
355356
}, []string{"user"}),
357+
receivedHistogramBuckets: promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
358+
Namespace: "cortex",
359+
Name: "distributor_received_histogram_buckets",
360+
Help: "The number of buckets in received native histogram samples before validation, per user.",
361+
NativeHistogramBucketFactor: 1.1,
362+
NativeHistogramMaxBucketNumber: 100,
363+
NativeHistogramMinResetDuration: 1 * time.Hour,
364+
Buckets: prometheus.ExponentialBuckets(1, 2, 10), // 1 to 512 buckets
365+
}, []string{"user"}),
356366
incomingSamples: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
357367
Namespace: "cortex",
358368
Name: "distributor_samples_in_total",
@@ -530,6 +540,7 @@ func (d *Distributor) cleanupInactiveUser(userID string) {
530540
d.receivedSamples.DeleteLabelValues(userID, sampleMetricTypeHistogramNHCB)
531541
d.receivedExemplars.DeleteLabelValues(userID)
532542
d.receivedMetadata.DeleteLabelValues(userID)
543+
d.receivedHistogramBuckets.DeleteLabelValues(userID)
533544
d.incomingSamples.DeleteLabelValues(userID, sampleMetricTypeFloat)
534545
d.incomingSamples.DeleteLabelValues(userID, sampleMetricTypeHistogram)
535546
d.incomingSamples.DeleteLabelValues(userID, sampleMetricTypeHistogramNHCB)
@@ -691,10 +702,12 @@ func (d *Distributor) validateSeries(ts cortexpb.PreallocTimeseries, userID stri
691702
if len(ts.Histograms) > 0 {
692703
// Only alloc when data present
693704
histograms = make([]cortexpb.WrappedHistogram, 0, len(ts.Histograms))
705+
receivedBucketsObserver := d.receivedHistogramBuckets.WithLabelValues(userID)
694706
for i, h := range ts.Histograms {
695707
if err := validation.ValidateSampleTimestamp(d.validateMetrics, limits, userID, ts.Labels, h.TimestampMs); err != nil {
696708
return emptyPreallocSeries, err
697709
}
710+
receivedBucketsObserver.Observe(float64(h.BucketCount()))
698711
convertedHistogram, err := validation.ValidateNativeHistogram(d.validateMetrics, limits, userID, ts.Labels, h.Histogram)
699712
if err != nil {
700713
return emptyPreallocSeries, err

pkg/distributor/distributor_test.go

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"github.com/go-kit/log"
2121
"github.com/prometheus/client_golang/prometheus"
2222
"github.com/prometheus/client_golang/prometheus/testutil"
23+
dto "github.com/prometheus/client_model/go"
2324
"github.com/prometheus/common/model"
2425
"github.com/prometheus/prometheus/model/labels"
2526
"github.com/prometheus/prometheus/model/relabel"
@@ -4938,3 +4939,35 @@ func TestDistributor_ShuffleShardingIngestersLookbackPeriod_Validation(t *testin
49384939
})
49394940
}
49404941
}
4942+
4943+
func TestDistributor_ReceivedHistogramBucketsMetric(t *testing.T) {
4944+
t.Parallel()
4945+
4946+
limits := &validation.Limits{}
4947+
flagext.DefaultValues(limits)
4948+
// Set a bucket limit so resolution reduction kicks in, but the metric should still capture the original count.
4949+
limits.MaxNativeHistogramBuckets = 4
4950+
4951+
ds, _, _, _ := prepare(t, prepConfig{
4952+
numIngesters: 3,
4953+
happyIngesters: 3,
4954+
numDistributors: 1,
4955+
shardByAllLabels: true,
4956+
limits: limits,
4957+
})
4958+
4959+
// Push a native histogram sample. GenerateTestHistogram produces 4 positive + 4 negative buckets + zero bucket = 9 buckets.
4960+
ctx := user.InjectOrgID(context.Background(), "user")
4961+
req := makeWriteRequest(0, 0, 0, 1)
4962+
_, err := ds[0].Push(ctx, req)
4963+
require.NoError(t, err)
4964+
4965+
// Verify the receivedHistogramBuckets metric observed the pre-validation bucket count.
4966+
m := &dto.Metric{}
4967+
observer, err := ds[0].receivedHistogramBuckets.GetMetricWithLabelValues("user")
4968+
require.NoError(t, err)
4969+
require.NoError(t, observer.(prometheus.Metric).Write(m))
4970+
require.Equal(t, uint64(1), m.GetHistogram().GetSampleCount())
4971+
// GenerateTestHistogram(0): 4 positive + 4 negative + 1 zero = 9 buckets
4972+
require.Equal(t, float64(9), m.GetHistogram().GetSampleSum())
4973+
}

pkg/ingester/ingester.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1527,6 +1527,7 @@ func (i *Ingester) Push(ctx context.Context, req *cortexpb.WriteRequest) (*corte
15271527
}
15281528

15291529
if i.limits.EnableNativeHistograms(userID) {
1530+
ingestedBucketsObserver := i.metrics.ingestedHistogramBuckets.WithLabelValues(userID)
15301531
for _, hp := range ts.Histograms {
15311532
var (
15321533
err error
@@ -1551,7 +1552,7 @@ func (i *Ingester) Push(ctx context.Context, req *cortexpb.WriteRequest) (*corte
15511552
if ref != 0 {
15521553
if _, err = app.AppendHistogram(ref, copiedLabels, hp.TimestampMs, h, fh); err == nil {
15531554
succeededHistogramsCount++
1554-
i.metrics.ingestedHistogramBuckets.WithLabelValues(userID).Observe(float64(hp.BucketCount()))
1555+
ingestedBucketsObserver.Observe(float64(hp.BucketCount()))
15551556
continue
15561557
}
15571558
} else {
@@ -1563,7 +1564,7 @@ func (i *Ingester) Push(ctx context.Context, req *cortexpb.WriteRequest) (*corte
15631564
newSeries = append(newSeries, copiedLabels)
15641565
}
15651566
succeededHistogramsCount++
1566-
i.metrics.ingestedHistogramBuckets.WithLabelValues(userID).Observe(float64(hp.BucketCount()))
1567+
ingestedBucketsObserver.Observe(float64(hp.BucketCount()))
15671568
continue
15681569
}
15691570
}

pkg/ingester/metrics.go

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
package ingester
22

33
import (
4+
"time"
5+
46
"github.com/prometheus/client_golang/prometheus"
57
"github.com/prometheus/client_golang/prometheus/promauto"
68

@@ -142,7 +144,7 @@ func newIngesterMetrics(r prometheus.Registerer,
142144
Help: "The number of ingested native histogram buckets per user.",
143145
NativeHistogramBucketFactor: 1.1,
144146
NativeHistogramMaxBucketNumber: 100,
145-
NativeHistogramMinResetDuration: 1,
147+
NativeHistogramMinResetDuration: 1 * time.Hour,
146148
Buckets: prometheus.ExponentialBuckets(1, 2, 10), // 1 to 512 buckets
147149
}, []string{"user"}),
148150
oooLabelsTotal: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
@@ -317,23 +319,23 @@ func newIngesterMetrics(r prometheus.Registerer,
317319
Help: "Length (in bytes) of unoptimized regex patterns in queries.",
318320
NativeHistogramBucketFactor: 1.1,
319321
NativeHistogramMaxBucketNumber: 100,
320-
NativeHistogramMinResetDuration: 1,
322+
NativeHistogramMinResetDuration: 1 * time.Hour,
321323
Buckets: prometheus.ExponentialBuckets(1, 2, 12), // 1 to 4096 bytes
322324
})
323325
m.unoptimizedRegexLabelCardinality = promauto.With(r).NewHistogram(prometheus.HistogramOpts{
324326
Name: "cortex_ingester_unoptimized_regex_label_cardinality",
325327
Help: "Cardinality of labels queried with unoptimized regex matchers.",
326328
NativeHistogramBucketFactor: 1.1,
327329
NativeHistogramMaxBucketNumber: 100,
328-
NativeHistogramMinResetDuration: 1,
330+
NativeHistogramMinResetDuration: 1 * time.Hour,
329331
Buckets: prometheus.ExponentialBuckets(1, 4, 10), // 1 to ~1M
330332
})
331333
m.unoptimizedRegexTotalValueLength = promauto.With(r).NewHistogram(prometheus.HistogramOpts{
332334
Name: "cortex_ingester_unoptimized_regex_total_value_length_bytes",
333335
Help: "Total length (in bytes) of all label values for labels queried with unoptimized regex matchers.",
334336
NativeHistogramBucketFactor: 1.1,
335337
NativeHistogramMaxBucketNumber: 100,
336-
NativeHistogramMinResetDuration: 1,
338+
NativeHistogramMinResetDuration: 1 * time.Hour,
337339
Buckets: prometheus.ExponentialBuckets(1, 4, 12), // 1 to ~16M bytes
338340
})
339341
m.unoptimizedRegexRejectedTotal = promauto.With(r).NewCounterVec(prometheus.CounterOpts{

0 commit comments

Comments
 (0)