@@ -16,6 +16,7 @@ import (
     "google.golang.org/grpc"
 
     "github.com/prometheus/client_golang/prometheus"
+    "github.com/prometheus/common/log"
     "github.com/prometheus/common/model"
     "github.com/prometheus/prometheus/storage/metric"
     "github.com/prometheus/prometheus/storage/remote"
@@ -41,7 +42,9 @@ type Distributor struct {
     cfg        Config
     ring       ReadRing
     clientsMtx sync.RWMutex
-    clients    map[string]cortex.IngesterClient
+    clients    map[string]ingesterClient
+    quit       chan struct{}
+    done       chan struct{}
 
     queryDuration   *prometheus.HistogramVec
     receivedSamples prometheus.Counter
@@ -52,6 +55,11 @@ type Distributor struct {
     ingesterQueryFailures *prometheus.CounterVec
 }
 
+type ingesterClient struct {
+    cortex.IngesterClient
+    conn *grpc.ClientConn
+}
+
 // ReadRing represents the read interface to the ring.
 type ReadRing interface {
     prometheus.Collector
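Worth noting why the wrapper is an embedded struct rather than a new interface: through embedding, ingesterClient satisfies cortex.IngesterClient automatically, so callers can keep treating map values as clients while each entry also carries the *grpc.ClientConn needed for cleanup. A self-contained toy sketch of that pattern (Greeter, english, and wrapped are illustrative names, not part of this change):

```go
package main

import "fmt"

// Greeter stands in for cortex.IngesterClient in this toy example.
type Greeter interface{ Greet() string }

type english struct{}

func (english) Greet() string { return "hello" }

// wrapped embeds a Greeter, so it satisfies Greeter itself while
// carrying extra state (here a name; in the diff, a *grpc.ClientConn).
type wrapped struct {
	Greeter
	name string
}

func main() {
	var g Greeter = wrapped{Greeter: english{}, name: "conn-1"}
	fmt.Println(g.Greet()) // "hello", forwarded to the embedded Greeter
}
```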
@@ -64,10 +72,11 @@ type ReadRing interface {
 // Config contains the configuration required to
 // create a Distributor
 type Config struct {
-    ReplicationFactor int
-    MinReadSuccesses  int
-    HeartbeatTimeout  time.Duration
-    RemoteTimeout     time.Duration
+    ReplicationFactor   int
+    MinReadSuccesses    int
+    HeartbeatTimeout    time.Duration
+    RemoteTimeout       time.Duration
+    ClientCleanupPeriod time.Duration
 }
 
 // RegisterFlags adds the flags required to configure this to the given FlagSet
@@ -76,6 +85,7 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
     flag.IntVar(&cfg.MinReadSuccesses, "distributor.min-read-successes", 2, "The minimum number of ingesters from which a read must succeed.")
     flag.DurationVar(&cfg.HeartbeatTimeout, "distributor.heartbeat-timeout", time.Minute, "The heartbeat timeout after which ingesters are skipped for reads/writes.")
     flag.DurationVar(&cfg.RemoteTimeout, "distributor.remote-timeout", 5*time.Second, "Timeout for downstream ingesters.")
+    flag.DurationVar(&cfg.ClientCleanupPeriod, "distributor.client-cleanup-period", 15*time.Second, "How frequently to clean up clients for ingesters that have gone away.")
 }
 
 // New constructs a new Distributor
@@ -89,7 +99,9 @@ func New(cfg Config, ring ReadRing) (*Distributor, error) {
     d := &Distributor{
         cfg:     cfg,
         ring:    ring,
-        clients: map[string]cortex.IngesterClient{},
+        clients: map[string]ingesterClient{},
+        quit:    make(chan struct{}),
+        done:    make(chan struct{}),
         queryDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{
             Namespace: "cortex",
             Name:      "distributor_query_duration_seconds",
@@ -129,9 +141,58 @@ func New(cfg Config, ring ReadRing) (*Distributor, error) {
         }, []string{"ingester"}),
     }
     prometheus.MustRegister(d)
+    go d.Run()
     return d, nil
 }
 
+// Run starts the distributor's maintenance loop.
+func (d *Distributor) Run() {
+    cleanupClients := time.NewTicker(d.cfg.ClientCleanupPeriod)
+    defer cleanupClients.Stop()
+    for {
+        select {
+        case <-cleanupClients.C:
+            d.removeStaleIngesterClients()
+        case <-d.quit:
+            close(d.done)
+            return
+        }
+    }
+}
+
+// Stop stops the distributor's maintenance loop.
+func (d *Distributor) Stop() {
+    close(d.quit)
+    <-d.done
+}
+
+// removeStaleIngesterClients drops cached clients for ingesters that are
+// no longer in the ring.
+func (d *Distributor) removeStaleIngesterClients() {
+    d.clientsMtx.Lock()
+    defer d.clientsMtx.Unlock()
+
+    ingesters := map[string]struct{}{}
+    for _, ing := range d.ring.GetAll() {
+        ingesters[ing.Addr] = struct{}{}
+    }
+
+    for addr, client := range d.clients {
+        if _, ok := ingesters[addr]; !ok {
+            log.Info("Removing stale ingester client for ", addr)
+            delete(d.clients, addr)
+            // Do the gRPC closing in the background since it might take a while
+            // and we're holding a mutex. addr and conn are passed as arguments
+            // so the goroutine doesn't capture the reused loop variables.
+            go func(addr string, conn *grpc.ClientConn) {
+                if err := conn.Close(); err != nil {
+                    log.Errorf("Error closing connection to ingester %q: %v", addr, err)
+                }
+            }(addr, client.conn)
+        }
+    }
+}
+
 func (d *Distributor) getClientFor(ingester *ring.IngesterDesc) (cortex.IngesterClient, error) {
     d.clientsMtx.RLock()
     client, ok := d.clients[ingester.Addr]
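The quit/done pair implements a common Go shutdown handshake: closing quit asks the loop to exit, and the loop closes done only after it has stopped, so Stop doubles as a join on the goroutine. A self-contained sketch of the same handshake (worker and its methods are illustrative names, not part of the change):

```go
package main

import (
	"fmt"
	"time"
)

// worker mirrors the Run/Stop handshake from the diff: the loop owns done
// and closes it only once it has observed quit, so stop returns only after
// the goroutine has genuinely exited.
type worker struct {
	quit chan struct{}
	done chan struct{}
}

func (w *worker) run() {
	tick := time.NewTicker(10 * time.Millisecond)
	defer tick.Stop()
	for {
		select {
		case <-tick.C:
			fmt.Println("tick") // stands in for removeStaleIngesterClients
		case <-w.quit:
			close(w.done)
			return
		}
	}
}

func (w *worker) stop() {
	close(w.quit)
	<-w.done // blocks until run has returned
}

func main() {
	w := &worker{quit: make(chan struct{}), done: make(chan struct{})}
	go w.run()
	time.Sleep(50 * time.Millisecond)
	w.stop()
}
```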
@@ -159,7 +216,10 @@ func (d *Distributor) getClientFor(ingester *ring.IngesterDesc) (cortex.Ingester
         return nil, err
     }
 
-    client = cortex.NewIngesterClient(conn)
+    client = ingesterClient{
+        IngesterClient: cortex.NewIngesterClient(conn),
+        conn:           conn,
+    }
     d.clients[ingester.Addr] = client
     return client, nil
 }
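With New now spawning the maintenance goroutine itself, the caller owns the other end of the lifecycle and should call Stop on shutdown. A minimal wiring sketch, not compilable on its own: it assumes it sits alongside the distributor package, and newRing() is a hypothetical stand-in for whatever ReadRing implementation the caller already has.

```go
// Hypothetical wiring; newRing() is a stand-in for a real ReadRing.
func example() {
	var cfg Config
	cfg.RegisterFlags(flag.CommandLine) // registers the -distributor.* flags, including the new cleanup period
	flag.Parse()

	d, err := New(cfg, newRing())
	if err != nil {
		log.Fatalf("Error creating distributor: %v", err)
	}
	// Stop closes quit and waits on done, so the cleanup loop has fully
	// exited before shutdown proceeds.
	defer d.Stop()

	// ... serve reads and writes ...
}
```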