Skip to content

Commit fee02a5

Browse files
csmarchbanks authored and bboreham committed
Add a healthcheck endpoint on the ingesters that distributors can use (#741)
* Add a healthcheck endpoint on the ingesters that distributors can use * Vendor in health protobuf stuff * Add config for turning on health check behavior * Refactored distributor client cache to ingester/client/IngesterClientCache
1 parent ff0b6ad commit fee02a5

File tree

16 files changed

+594
-135
lines changed

16 files changed

+594
-135
lines changed

Gopkg.lock

+2-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/distributor/distributor.go

+28-58
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ import (
66
"flag"
77
"fmt"
88
"hash/fnv"
9-
"io"
109
"sync"
1110
"sync/atomic"
1211
"time"
@@ -44,12 +43,12 @@ var (
4443
// Distributor is a storage.SampleAppender and a client.Querier which
4544
// forwards appends and queries to individual ingesters.
4645
type Distributor struct {
47-
cfg Config
48-
ring ring.ReadRing
49-
clientsMtx sync.RWMutex
50-
clients map[string]client.IngesterClient
51-
quit chan struct{}
52-
done chan struct{}
46+
cfg Config
47+
ring ring.ReadRing
48+
clientsMtx sync.RWMutex
49+
ingesterPool *ingester_client.IngesterPool
50+
quit chan struct{}
51+
done chan struct{}
5352

5453
billingClient *billing.Client
5554

@@ -73,11 +72,12 @@ type Config struct {
7372
BillingConfig billing.Config
7473
IngesterClientConfig ingester_client.Config
7574

76-
ReplicationFactor int
77-
RemoteTimeout time.Duration
78-
ClientCleanupPeriod time.Duration
79-
IngestionRateLimit float64
80-
IngestionBurstSize int
75+
ReplicationFactor int
76+
RemoteTimeout time.Duration
77+
ClientCleanupPeriod time.Duration
78+
IngestionRateLimit float64
79+
IngestionBurstSize int
80+
HealthCheckIngesters bool
8181

8282
// for testing
8383
ingesterClientFactory func(addr string, cfg ingester_client.Config) (client.IngesterClient, error)
@@ -93,6 +93,7 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
9393
flag.DurationVar(&cfg.ClientCleanupPeriod, "distributor.client-cleanup-period", 15*time.Second, "How frequently to clean up clients for ingesters that have gone away.")
9494
flag.Float64Var(&cfg.IngestionRateLimit, "distributor.ingestion-rate-limit", 25000, "Per-user ingestion rate limit in samples per second.")
9595
flag.IntVar(&cfg.IngestionBurstSize, "distributor.ingestion-burst-size", 50000, "Per-user allowed ingestion burst size (in number of samples).")
96+
flag.BoolVar(&cfg.HealthCheckIngesters, "distributor.health-check-ingesters", false, "Run a health check on each ingester client during periodic cleanup.")
9697
}
9798

9899
// New constructs a new Distributor
@@ -116,7 +117,7 @@ func New(cfg Config, ring ring.ReadRing) (*Distributor, error) {
116117
d := &Distributor{
117118
cfg: cfg,
118119
ring: ring,
119-
clients: map[string]client.IngesterClient{},
120+
ingesterPool: ingester_client.NewIngesterPool(cfg.ingesterClientFactory, cfg.IngesterClientConfig, cfg.RemoteTimeout),
120121
quit: make(chan struct{}),
121122
done: make(chan struct{}),
122123
billingClient: billingClient,
@@ -170,6 +171,9 @@ func (d *Distributor) Run() {
170171
select {
171172
case <-cleanupClients.C:
172173
d.removeStaleIngesterClients()
174+
if d.cfg.HealthCheckIngesters {
175+
d.ingesterPool.CleanUnhealthy()
176+
}
173177
case <-d.quit:
174178
close(d.done)
175179
return
@@ -184,52 +188,18 @@ func (d *Distributor) Stop() {
184188
}
185189

186190
func (d *Distributor) removeStaleIngesterClients() {
187-
d.clientsMtx.Lock()
188-
defer d.clientsMtx.Unlock()
189-
190191
ingesters := map[string]struct{}{}
191192
for _, ing := range d.ring.GetAll() {
192193
ingesters[ing.Addr] = struct{}{}
193194
}
194195

195-
for addr, client := range d.clients {
196+
for _, addr := range d.ingesterPool.RegisteredAddresses() {
196197
if _, ok := ingesters[addr]; ok {
197198
continue
198199
}
199200
level.Info(util.Logger).Log("msg", "removing stale ingester client", "addr", addr)
200-
delete(d.clients, addr)
201-
202-
// Do the gRPC closing in the background since it might take a while and
203-
// we're holding a mutex.
204-
go func(addr string, closer io.Closer) {
205-
if err := closer.Close(); err != nil {
206-
level.Error(util.Logger).Log("msg", "error closing connection to ingester", "ingester", addr, "err", err)
207-
}
208-
}(addr, client.(io.Closer))
209-
}
210-
}
211-
212-
func (d *Distributor) getClientFor(ingester *ring.IngesterDesc) (client.IngesterClient, error) {
213-
d.clientsMtx.RLock()
214-
client, ok := d.clients[ingester.Addr]
215-
d.clientsMtx.RUnlock()
216-
if ok {
217-
return client, nil
218-
}
219-
220-
d.clientsMtx.Lock()
221-
defer d.clientsMtx.Unlock()
222-
client, ok = d.clients[ingester.Addr]
223-
if ok {
224-
return client, nil
225-
}
226-
227-
client, err := d.cfg.ingesterClientFactory(ingester.Addr, d.cfg.IngesterClientConfig)
228-
if err != nil {
229-
return nil, err
201+
d.ingesterPool.RemoveClientFor(addr)
230202
}
231-
d.clients[ingester.Addr] = client
232-
return client, nil
233203
}
234204

235205
func tokenForLabels(userID string, labels []client.LabelPair) (uint32, error) {
@@ -412,7 +382,7 @@ func (d *Distributor) sendSamples(ctx context.Context, ingester *ring.IngesterDe
412382
}
413383

414384
func (d *Distributor) sendSamplesErr(ctx context.Context, ingester *ring.IngesterDesc, samples []*sampleTracker) error {
415-
c, err := d.getClientFor(ingester)
385+
c, err := d.ingesterPool.GetClientFor(ingester.Addr)
416386
if err != nil {
417387
return err
418388
}
@@ -449,7 +419,7 @@ func (d *Distributor) Query(ctx context.Context, from, to model.Time, matchers .
449419

450420
metricNameMatcher, _, ok := util.ExtractMetricNameMatcherFromMatchers(matchers)
451421

452-
req, err := util.ToQueryRequest(from, to, matchers)
422+
req, err := ingester_client.ToQueryRequest(from, to, matchers)
453423
if err != nil {
454424
return err
455425
}
@@ -529,7 +499,7 @@ func (d *Distributor) queryIngesters(ctx context.Context, ingesters []*ring.Inge
529499
}
530500

531501
func (d *Distributor) queryIngester(ctx context.Context, ing *ring.IngesterDesc, req *client.QueryRequest) (model.Matrix, error) {
532-
client, err := d.getClientFor(ing)
502+
client, err := d.ingesterPool.GetClientFor(ing.Addr)
533503
if err != nil {
534504
return nil, err
535505
}
@@ -541,7 +511,7 @@ func (d *Distributor) queryIngester(ctx context.Context, ing *ring.IngesterDesc,
541511
return nil, err
542512
}
543513

544-
return util.FromQueryResponse(resp), nil
514+
return ingester_client.FromQueryResponse(resp), nil
545515
}
546516

547517
// forAllIngesters runs f, in parallel, for all ingesters
@@ -550,7 +520,7 @@ func (d *Distributor) forAllIngesters(f func(client.IngesterClient) (interface{}
550520
ingesters := d.ring.GetAll()
551521
for _, ingester := range ingesters {
552522
go func(ingester *ring.IngesterDesc) {
553-
client, err := d.getClientFor(ingester)
523+
client, err := d.ingesterPool.GetClientFor(ingester.Addr)
554524
if err != nil {
555525
errs <- err
556526
return
@@ -609,7 +579,7 @@ func (d *Distributor) LabelValuesForLabelName(ctx context.Context, labelName mod
609579

610580
// MetricsForLabelMatchers gets the metrics that match said matchers
611581
func (d *Distributor) MetricsForLabelMatchers(ctx context.Context, from, through model.Time, matchers ...*labels.Matcher) ([]metric.Metric, error) {
612-
req, err := util.ToMetricsForLabelMatchersRequest(from, through, matchers)
582+
req, err := ingester_client.ToMetricsForLabelMatchersRequest(from, through, matchers)
613583
if err != nil {
614584
return nil, err
615585
}
@@ -623,7 +593,7 @@ func (d *Distributor) MetricsForLabelMatchers(ctx context.Context, from, through
623593

624594
metrics := map[model.Fingerprint]model.Metric{}
625595
for _, resp := range resps {
626-
ms := util.FromMetricsForLabelMatchersResponse(resp.(*client.MetricsForLabelMatchersResponse))
596+
ms := ingester_client.FromMetricsForLabelMatchersResponse(resp.(*client.MetricsForLabelMatchersResponse))
627597
for _, m := range ms {
628598
metrics[m.Fingerprint()] = m
629599
}
@@ -677,7 +647,7 @@ func (d *Distributor) AllUserStats(ctx context.Context) ([]UserIDStats, error) {
677647
// Not using d.forAllIngesters(), so we can fail after first error.
678648
ingesters := d.ring.GetAll()
679649
for _, ingester := range ingesters {
680-
client, err := d.getClientFor(ingester)
650+
client, err := d.ingesterPool.GetClientFor(ingester.Addr)
681651
if err != nil {
682652
return nil, err
683653
}
@@ -736,6 +706,6 @@ func (d *Distributor) Collect(ch chan<- prometheus.Metric) {
736706
ch <- prometheus.MustNewConstMetric(
737707
numClientsDesc,
738708
prometheus.GaugeValue,
739-
float64(len(d.clients)),
709+
float64(d.ingesterPool.Count()),
740710
)
741711
}

0 commit comments

Comments (0)