Add per-token state, and use it to not write to Leaving ingesters #90
@@ -4,6 +4,7 @@ import (
	"fmt"
	"hash/fnv"
	"sync"
	"sync/atomic"
	"time"

	"github.com/prometheus/client_golang/prometheus"
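The new sync/atomic import supports the per-sample success counter introduced later in this diff, which is bumped concurrently from the goroutines that push to each ingester. A minimal standalone sketch of that pattern (the names here are illustrative, not taken from the diff):

	package main

	import (
		"fmt"
		"sync"
		"sync/atomic"
	)

	func main() {
		// Three goroutines acknowledge the same sample concurrently; the
		// counter is incremented with sync/atomic so no mutex is needed.
		var succeeded int32
		var wg sync.WaitGroup
		for i := 0; i < 3; i++ {
			wg.Add(1)
			go func() {
				defer wg.Done()
				atomic.AddInt32(&succeeded, 1)
			}()
		}
		wg.Wait()
		fmt.Println("acks:", atomic.LoadInt32(&succeeded))
	}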
@@ -46,7 +47,8 @@ type Distributor struct {
type ReadRing interface {
	prometheus.Collector

	Get(key uint32, n int) ([]ring.IngesterDesc, error)
	Get(key uint32, n int, op ring.Operation) ([]ring.IngesterDesc, error)
	BatchGet(keys []uint32, n int, op ring.Operation) ([][]ring.IngesterDesc, error)

Review comment: I'm confused. If this is a …

Reply: Yes, we're always reading from the ring, but the operation says what that info is for - so we read from the ring when writing samples and when processing queries. The way we handle leaving nodes depends on why we're asking...

	GetAll() []ring.IngesterDesc
}
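To illustrate the reply above: the ring is always being read, but the Operation says what the caller intends to do with the ingesters it gets back, so per-token state such as Leaving can be interpreted differently for writes and for queries. A minimal sketch of that idea, assuming hypothetical Active/Leaving states and an eligibility rule; the real ring package's types and rules may differ:

	package main

	import "fmt"

	// Operation mirrors the op argument added to Get and BatchGet above; the
	// ingester states below are assumptions for illustration, not the real
	// ring package's definitions.
	type Operation int

	const (
		Read Operation = iota
		Write
	)

	type IngesterState int

	const (
		Active IngesterState = iota
		Leaving
	)

	// eligible sketches the intent of the PR title: a Leaving ingester should
	// no longer receive writes, but may still be asked for data on reads.
	func eligible(state IngesterState, op Operation) bool {
		switch op {
		case Write:
			return state == Active
		case Read:
			return state == Active || state == Leaving
		default:
			return false
		}
	}

	func main() {
		fmt.Println("write to Leaving? ", eligible(Leaving, Write)) // false
		fmt.Println("read from Leaving?", eligible(Leaving, Read))  // true
	}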
@@ -61,7 +63,6 @@ type DistributorConfig struct {

	ReplicationFactor int
	MinReadSuccesses  int
	MinWriteSuccesses int
	HeartbeatTimeout  time.Duration
}
@@ -70,9 +71,6 @@ func NewDistributor(cfg DistributorConfig) (*Distributor, error) {
	if 0 > cfg.ReplicationFactor {
		return nil, fmt.Errorf("ReplicationFactor must be greater than zero: %d", cfg.ReplicationFactor)
	}
	if cfg.MinWriteSuccesses > cfg.ReplicationFactor {
		return nil, fmt.Errorf("MinWriteSuccesses > ReplicationFactor: %d > %d", cfg.MinWriteSuccesses, cfg.ReplicationFactor)
	}
	if cfg.MinReadSuccesses > cfg.ReplicationFactor {
		return nil, fmt.Errorf("MinReadSuccesses > ReplicationFactor: %d > %d", cfg.MinReadSuccesses, cfg.ReplicationFactor)
	}
@@ -154,6 +152,12 @@ func tokenFor(userID string, name model.LabelValue) uint32 {
	return h.Sum32()
}

type sampleTracker struct {
	sample     *model.Sample
	minSuccess int
	succeeded  int32
}

// Append implements SampleAppender.
func (d *Distributor) Append(ctx context.Context, samples []*model.Sample) error {
	userID, err := user.GetID(ctx)
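The minSuccess field encodes a per-sample write quorum in place of the single distributor-wide MinWriteSuccesses check removed above: with n replicas for a sample's token, n/2 + 1 acknowledgements are required. A quick standalone illustration of the arithmetic (not part of the diff):

	package main

	import "fmt"

	// minSuccess reproduces the quorum rule used for each sampleTracker:
	// a majority of the replicas returned for the sample's token.
	func minSuccess(replicas int) int {
		return replicas/2 + 1
	}

	func main() {
		for _, n := range []int{1, 2, 3, 5} {
			q := minSuccess(n)
			fmt.Printf("replicas=%d quorum=%d tolerated failures=%d\n", n, q, n-q)
		}
		// With a replication factor of 3, for example, one ingester can be
		// down or Leaving and the write still succeeds.
	}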
@@ -163,53 +167,90 @@ func (d *Distributor) Append(ctx context.Context, samples []*model.Sample) error

	d.receivedSamples.Add(float64(len(samples)))

	samplesByIngester := map[string][]*model.Sample{}
	for _, sample := range samples {
		key := tokenForMetric(userID, sample.Metric)
		ingesters, err := d.cfg.Ring.Get(key, d.cfg.ReplicationFactor)
		if err != nil {
			return err
	keys := make([]uint32, len(samples), len(samples))
	for i, sample := range samples {
		keys[i] = tokenForMetric(userID, sample.Metric)
	}

	ingesters, err := d.cfg.Ring.BatchGet(keys, d.cfg.ReplicationFactor, ring.Write)
	if err != nil {
		return err
	}

	sampleTrackers := make([]sampleTracker, len(samples), len(samples))
	samplesByIngester := map[string][]*sampleTracker{}
	for i := range samples {
		sampleTrackers[i] = sampleTracker{
			sample: samples[i],
			// We need a response from a quorum of ingesters, which is n/2 + 1.
			minSuccess: (len(ingesters[i]) / 2) + 1,
			succeeded:  0,
		}
		for _, ingester := range ingesters {
			otherSamples := samplesByIngester[ingester.Hostname]
			samplesByIngester[ingester.Hostname] = append(otherSamples, sample)

		// Skip those that have not heartbeated in a while. NB these are still
		// included in the calculation of minSuccess, so too many failed
		// ingesters will cause the whole write to fail.
		liveIngesters := make([]string, 0, len(ingesters[i]))
		for _, ingester := range ingesters[i] {
			if time.Now().Sub(ingester.Timestamp) <= d.cfg.HeartbeatTimeout {
				liveIngesters = append(liveIngesters, ingester.Hostname)
			}
		}

		// This is just a shortcut - if there are not minSuccess available ingesters
		// after filtering out dead ones, don't even bother trying.
		if len(liveIngesters) < sampleTrackers[i].minSuccess {
			return fmt.Errorf("wanted at least %d live ingesters to process write, had %d",
				sampleTrackers[i].minSuccess, len(liveIngesters))
		}

		for _, liveIngester := range liveIngesters {
			sampleForIngester := samplesByIngester[liveIngester]
			samplesByIngester[liveIngester] = append(sampleForIngester, &sampleTrackers[i])
		}

Review comment: I have a feeling this …

	}

	errs := make(chan error)
	for hostname, samples := range samplesByIngester {
		go func(hostname string, samples []*model.Sample) {
		go func(hostname string, samples []*sampleTracker) {
			errs <- d.sendSamples(ctx, hostname, samples)
		}(hostname, samples)
	}
	var lastErr error
	successes := 0
	for i := 0; i < len(samplesByIngester); i++ {
		if err := <-errs; err != nil {
			lastErr = err
			continue
		}
		successes++
	}

	if successes < d.cfg.MinWriteSuccesses {
		return fmt.Errorf("too few successful writes, last error was: %v", lastErr)
	for i := range sampleTrackers {
		if sampleTrackers[i].succeeded < int32(sampleTrackers[i].minSuccess) {
			return fmt.Errorf("need %d successful writes, only got %d, last error was: %v",
				sampleTrackers[i].minSuccess, sampleTrackers[i].succeeded, lastErr)
		}
	}
	return nil
}

func (d *Distributor) sendSamples(ctx context.Context, hostname string, samples []*model.Sample) error {
func (d *Distributor) sendSamples(ctx context.Context, hostname string, sampleTrackers []*sampleTracker) error {
	client, err := d.getClientFor(hostname)
	if err != nil {
		return err
	}
	samples := make([]*model.Sample, len(sampleTrackers), len(sampleTrackers))
	for i := range sampleTrackers {
		samples[i] = sampleTrackers[i].sample
	}
	err = instrument.TimeRequestHistogram("send", d.sendDuration, func() error {
		return client.Append(ctx, samples)
	})
	if err != nil {
		d.ingesterAppendFailures.WithLabelValues(hostname).Inc()
	}
	d.ingesterAppends.WithLabelValues(hostname).Inc()
	for i := range sampleTrackers {
		atomic.AddInt32(&sampleTrackers[i].succeeded, 1)
	}
	return err
}
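Pulling the new Append path together: each sample fans out to the live ingesters for its token, every successful push atomically bumps the succeeded counter of each sample it carried, and the write as a whole succeeds only if every sample reached its quorum. A condensed, self-contained sketch of that flow (simplified types and hypothetical names; the real client, context handling, and error channel are omitted):

	package main

	import (
		"fmt"
		"sync"
		"sync/atomic"
	)

	// tracker is a stripped-down stand-in for sampleTracker.
	type tracker struct {
		name       string
		minSuccess int
		succeeded  int32
	}

	// pushToIngester stands in for sendSamples: on success it acknowledges
	// every tracker in the batch it was given.
	func pushToIngester(host string, batch []*tracker, fail bool) error {
		if fail {
			return fmt.Errorf("ingester %s unavailable", host)
		}
		for _, t := range batch {
			atomic.AddInt32(&t.succeeded, 1)
		}
		return nil
	}

	func main() {
		// One sample replicated to three ingesters, needing a quorum of two.
		s := &tracker{name: "sample-0", minSuccess: 2}
		batches := map[string][]*tracker{
			"ingester-1": {s},
			"ingester-2": {s},
			"ingester-3": {s},
		}
		failures := map[string]bool{"ingester-3": true} // one replica is down

		var wg sync.WaitGroup
		for host, batch := range batches {
			wg.Add(1)
			go func(host string, batch []*tracker) {
				defer wg.Done()
				if err := pushToIngester(host, batch, failures[host]); err != nil {
					fmt.Println("push failed:", err)
				}
			}(host, batch)
		}
		wg.Wait()

		if atomic.LoadInt32(&s.succeeded) >= int32(s.minSuccess) {
			fmt.Printf("%s reached quorum (%d/%d)\n", s.name, s.succeeded, s.minSuccess)
		} else {
			fmt.Printf("%s failed quorum (%d/%d)\n", s.name, s.succeeded, s.minSuccess)
		}
	}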
@@ -241,7 +282,7 @@ func (d *Distributor) Query(ctx context.Context, from, to model.Time, matchers .
		return err
	}

	ingesters, err := d.cfg.Ring.Get(tokenFor(userID, metricName), d.cfg.ReplicationFactor)
	ingesters, err := d.cfg.Ring.Get(tokenFor(userID, metricName), d.cfg.ReplicationFactor, ring.Read)
	if err != nil {
		return err
	}
Review comment: Can you move this defer to before the prometheus metric registration? I know it's extremely unlikely that MustRegister will fail, but if it does, we will register the ingester without ever unregistering it.

Reply: Sure!
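For context on the exchange above: deferred functions still run when a later call in the same function panics, but only if the defer statement has already executed, which is why moving the defer ahead of prometheus.MustRegister avoids leaking the ring registration. A generic illustration of that ordering (not the PR's actual code; mustRegister here is a stand-in that panics the way prometheus.MustRegister does on failure):

	package main

	import "fmt"

	// mustRegister stands in for prometheus.MustRegister, which panics on
	// failure (for example, a duplicate collector registration).
	func mustRegister(fail bool) {
		if fail {
			panic("duplicate metrics collector registration attempted")
		}
	}

	func register(failMetrics bool) {
		fmt.Println("registered ingester in ring")
		// Because this defer statement executes before mustRegister is
		// called, the cleanup still fires if mustRegister panics; placed
		// after it, a panic would leave the ingester registered forever.
		defer fmt.Println("unregistered ingester from ring")
		mustRegister(failMetrics)
	}

	func main() {
		defer func() {
			if r := recover(); r != nil {
				fmt.Println("recovered:", r)
			}
		}()
		register(true)
	}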