 	"fmt"
 	"hash/fnv"
 	"sync"
+	"sync/atomic"
 	"time"

 	"github.com/prometheus/client_golang/prometheus"
@@ -46,7 +47,8 @@ type Distributor struct {
 type ReadRing interface {
 	prometheus.Collector

-	Get(key uint32, n int) ([]ring.IngesterDesc, error)
+	Get(key uint32, n int, op ring.Operation) ([]ring.IngesterDesc, error)
+	BatchGet(keys []uint32, n int, op ring.Operation) ([][]ring.IngesterDesc, error)
 	GetAll() []ring.IngesterDesc
 }

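The widened interface is the heart of this change: callers now say whether they are reading or writing, and can resolve replicas for many keys at once. As a rough illustration, a test double could satisfy it by delegating BatchGet to repeated Get calls; fakeRing below is hypothetical, not part of this change, and assumes this repo's ring and prometheus imports. The real ring can presumably answer a batch in a single pass over its token list.

```go
// Hypothetical test double, not from this change: satisfies the new
// ReadRing by funnelling BatchGet through a pluggable Get.
type fakeRing struct {
	prometheus.Collector
	get func(key uint32, n int, op ring.Operation) ([]ring.IngesterDesc, error)
}

func (r *fakeRing) Get(key uint32, n int, op ring.Operation) ([]ring.IngesterDesc, error) {
	return r.get(key, n, op)
}

// BatchGet returns one replica set per key, in key order.
func (r *fakeRing) BatchGet(keys []uint32, n int, op ring.Operation) ([][]ring.IngesterDesc, error) {
	result := make([][]ring.IngesterDesc, len(keys))
	for i, key := range keys {
		descs, err := r.Get(key, n, op)
		if err != nil {
			return nil, err
		}
		result[i] = descs
	}
	return result, nil
}

func (r *fakeRing) GetAll() []ring.IngesterDesc { return nil }
```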
@@ -61,7 +63,6 @@ type DistributorConfig struct {

 	ReplicationFactor int
 	MinReadSuccesses  int
-	MinWriteSuccesses int
 	HeartbeatTimeout  time.Duration
 }

@@ -70,9 +71,6 @@ func NewDistributor(cfg DistributorConfig) (*Distributor, error) {
 	if 0 > cfg.ReplicationFactor {
 		return nil, fmt.Errorf("ReplicationFactor must be greater than zero: %d", cfg.ReplicationFactor)
 	}
-	if cfg.MinWriteSuccesses > cfg.ReplicationFactor {
-		return nil, fmt.Errorf("MinWriteSuccesses > ReplicationFactor: %d > %d", cfg.MinWriteSuccesses, cfg.ReplicationFactor)
-	}
 	if cfg.MinReadSuccesses > cfg.ReplicationFactor {
 		return nil, fmt.Errorf("MinReadSuccesses > ReplicationFactor: %d > %d", cfg.MinReadSuccesses, cfg.ReplicationFactor)
 	}
@@ -154,6 +152,12 @@ func tokenFor(userID string, name model.LabelValue) uint32 {
 	return h.Sum32()
 }

+type sampleTracker struct {
+	sample     *model.Sample
+	minSuccess int
+	succeeded  int32
+}
+
 // Append implements SampleAppender.
 func (d *Distributor) Append(ctx context.Context, samples []*model.Sample) error {
 	userID, err := user.GetID(ctx)
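sampleTracker gives every sample its own quorum target instead of the old global MinWriteSuccesses: for a replica set of 3 ingesters, minSuccess is (3/2)+1 = 2. succeeded is an int32 so the per-ingester goroutines can record acks with sync/atomic. A minimal, runnable sketch of that bookkeeping, using toy types rather than this file's:

```go
package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// Toy version of sampleTracker; the sample itself is omitted.
type tracker struct {
	minSuccess int
	succeeded  int32
}

func main() {
	// Three replicas -> quorum of (3 / 2) + 1 = 2 acks per sample.
	t := tracker{minSuccess: (3 / 2) + 1}

	var wg sync.WaitGroup
	for i := 0; i < 3; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			// Each replica's goroutine records its ack atomically,
			// mirroring the atomic.AddInt32 in sendSamples below.
			atomic.AddInt32(&t.succeeded, 1)
		}()
	}
	wg.Wait()

	fmt.Printf("acks=%d, quorum=%d, write ok=%v\n",
		t.succeeded, t.minSuccess, int(t.succeeded) >= t.minSuccess)
}
```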
@@ -163,53 +167,90 @@ func (d *Distributor) Append(ctx context.Context, samples []*model.Sample) error

 	d.receivedSamples.Add(float64(len(samples)))

-	samplesByIngester := map[string][]*model.Sample{}
-	for _, sample := range samples {
-		key := tokenForMetric(userID, sample.Metric)
-		ingesters, err := d.cfg.Ring.Get(key, d.cfg.ReplicationFactor)
-		if err != nil {
-			return err
+	keys := make([]uint32, len(samples), len(samples))
+	for i, sample := range samples {
+		keys[i] = tokenForMetric(userID, sample.Metric)
+	}
+
+	ingesters, err := d.cfg.Ring.BatchGet(keys, d.cfg.ReplicationFactor, ring.Write)
+	if err != nil {
+		return err
+	}
+
+	sampleTrackers := make([]sampleTracker, len(samples), len(samples))
+	samplesByIngester := map[string][]*sampleTracker{}
+	for i := range samples {
+		sampleTrackers[i] = sampleTracker{
+			sample: samples[i],
+			// We need a response from a quorum of ingesters, which is n/2 + 1.
+			minSuccess: (len(ingesters[i]) / 2) + 1,
+			succeeded:  0,
 		}
-		for _, ingester := range ingesters {
-			otherSamples := samplesByIngester[ingester.Hostname]
-			samplesByIngester[ingester.Hostname] = append(otherSamples, sample)
+
+		// Skip those that have not heartbeated in a while. NB these are still
+		// included in the calculation of minSuccess, so too many failed
+		// ingesters will cause the whole write to fail.
+		liveIngesters := make([]string, 0, len(ingesters[i]))
+		for _, ingester := range ingesters[i] {
+			if time.Now().Sub(ingester.Timestamp) <= d.cfg.HeartbeatTimeout {
+				liveIngesters = append(liveIngesters, ingester.Hostname)
+			}
+		}
+
+		// This is just a shortcut - if there are not minSuccess available
+		// ingesters, after filtering out dead ones, don't even bother trying.
+		if len(liveIngesters) < sampleTrackers[i].minSuccess {
+			return fmt.Errorf("wanted at least %d live ingesters to process write, had %d",
+				sampleTrackers[i].minSuccess, len(liveIngesters))
+		}
+
+		for _, liveIngester := range liveIngesters {
+			sampleForIngester := samplesByIngester[liveIngester]
+			samplesByIngester[liveIngester] = append(sampleForIngester, &sampleTrackers[i])
 		}
 	}

 	errs := make(chan error)
 	for hostname, samples := range samplesByIngester {
-		go func(hostname string, samples []*model.Sample) {
+		go func(hostname string, samples []*sampleTracker) {
 			errs <- d.sendSamples(ctx, hostname, samples)
 		}(hostname, samples)
 	}
 	var lastErr error
-	successes := 0
 	for i := 0; i < len(samplesByIngester); i++ {
 		if err := <-errs; err != nil {
 			lastErr = err
 			continue
 		}
-		successes++
 	}
-
-	if successes < d.cfg.MinWriteSuccesses {
-		return fmt.Errorf("too few successful writes, last error was: %v", lastErr)
+	for i := range sampleTrackers {
+		if sampleTrackers[i].succeeded < int32(sampleTrackers[i].minSuccess) {
+			return fmt.Errorf("need %d successful writes, only got %d, last error was: %v",
+				sampleTrackers[i].minSuccess, sampleTrackers[i].succeeded, lastErr)
+		}
 	}
 	return nil
 }

-func (d *Distributor) sendSamples(ctx context.Context, hostname string, samples []*model.Sample) error {
+func (d *Distributor) sendSamples(ctx context.Context, hostname string, sampleTrackers []*sampleTracker) error {
 	client, err := d.getClientFor(hostname)
 	if err != nil {
 		return err
 	}
+	samples := make([]*model.Sample, len(sampleTrackers), len(sampleTrackers))
+	for i := range sampleTrackers {
+		samples[i] = sampleTrackers[i].sample
+	}
 	err = instrument.TimeRequestHistogram("send", d.sendDuration, func() error {
 		return client.Append(ctx, samples)
 	})
 	if err != nil {
 		d.ingesterAppendFailures.WithLabelValues(hostname).Inc()
 	}
 	d.ingesterAppends.WithLabelValues(hostname).Inc()
+	for i := range sampleTrackers {
+		atomic.AddInt32(&sampleTrackers[i].succeeded, 1)
+	}
 	return err
 }

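Note the shape of the fan-out in Append: one goroutine per ingester, all reporting on a single unbuffered errs channel, which is drained exactly len(samplesByIngester) times so no sender can leak; only after every response is in does the per-sample quorum check run. The same pattern in isolation (an illustrative toy, not this file's code):

```go
package main

import (
	"errors"
	"fmt"
)

func main() {
	batches := map[string][]string{
		"ingester-1": {"a", "b"},
		"ingester-2": {"a"},
		"ingester-3": {"b"},
	}

	// Stand-in for sendSamples: pretend one ingester is down.
	send := func(host string, batch []string) error {
		if host == "ingester-2" {
			return errors.New("connection refused")
		}
		return nil
	}

	// Fan out: one goroutine per ingester, all reporting on one channel.
	errs := make(chan error)
	for host, batch := range batches {
		go func(host string, batch []string) {
			errs <- send(host, batch)
		}(host, batch)
	}

	// Fan in: receive exactly len(batches) results so no goroutine
	// blocks forever on the unbuffered channel.
	var lastErr error
	for i := 0; i < len(batches); i++ {
		if err := <-errs; err != nil {
			lastErr = err
		}
	}
	fmt.Println("last error:", lastErr) // the quorum check would follow here
}
```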
@@ -241,7 +282,7 @@ func (d *Distributor) Query(ctx context.Context, from, to model.Time, matchers ...
 		return err
 	}

-	ingesters, err := d.cfg.Ring.Get(tokenFor(userID, metricName), d.cfg.ReplicationFactor)
+	ingesters, err := d.cfg.Ring.Get(tokenFor(userID, metricName), d.cfg.ReplicationFactor, ring.Read)
 	if err != nil {
 		return err
 	}
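Queries now pass ring.Read where Append passes ring.Write. The Operation type itself is not shown in this diff; assuming it is a small enum-style type in the ring package (a guess at its shape, not code from this change), it might look like:

```go
// Assumed shape of the ring package's new type; the real
// definition may differ.
type Operation int

const (
	Read Operation = iota
	Write
)
```

Threading the operation through presumably lets the ring apply different replica-selection rules per operation, for example treating joining or leaving ingesters differently for reads than for writes.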