Skip to content

Commit 6b8bd5a

Browse files
authored
Allow disabling of ring heartbeats by setting relevant options to zero. (#4344)
* Allow disabling of ring heartbeats by setting relevant options to zero. Signed-off-by: Steve Simpson <[email protected]> * Review comments. Signed-off-by: Steve Simpson <[email protected]>
1 parent 04a8c99 commit 6b8bd5a

File tree

14 files changed

+85
-30
lines changed

14 files changed

+85
-30
lines changed

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,13 @@
1414
* `-alertmanager.sharding-ring.heartbeat-timeout`
1515
* `-compactor.ring.heartbeat-timeout`
1616
* `-store-gateway.sharding-ring.heartbeat-timeout`
17+
* [ENHANCEMENT] Ring: allow heartbeats to be explicitly disabled by setting the interval to zero. This is considered experimental. This applies to the following configuration options: #4344
18+
* `-distributor.ring.heartbeat-period`
19+
* `-ingester.heartbeat-period`
20+
* `-ruler.ring.heartbeat-period`
21+
* `-alertmanager.sharding-ring.heartbeat-period`
22+
* `-compactor.ring.heartbeat-period`
23+
* `-store-gateway.sharding-ring.heartbeat-period`
1724
* [ENHANCEMENT] Memberlist: optimized receive path for processing ring state updates, to help reduce CPU utilization in large clusters. #4345
1825
* [BUGFIX] HA Tracker: when cleaning up obsolete elected replicas from KV store, tracker didn't update number of cluster per user correctly. #4336
1926

docs/blocks-storage/compactor.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ compactor:
209209
# CLI flag: -compactor.ring.multi.mirror-timeout
210210
[mirror_timeout: <duration> | default = 2s]
211211

212-
# Period at which to heartbeat to the ring.
212+
# Period at which to heartbeat to the ring. 0 = disabled.
213213
# CLI flag: -compactor.ring.heartbeat-period
214214
[heartbeat_period: <duration> | default = 5s]
215215

docs/blocks-storage/store-gateway.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,7 @@ store_gateway:
232232
# CLI flag: -store-gateway.sharding-ring.multi.mirror-timeout
233233
[mirror_timeout: <duration> | default = 2s]
234234

235-
# Period at which to heartbeat to the ring.
235+
# Period at which to heartbeat to the ring. 0 = disabled.
236236
# CLI flag: -store-gateway.sharding-ring.heartbeat-period
237237
[heartbeat_period: <duration> | default = 15s]
238238

docs/configuration/config-file-reference.md

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -563,7 +563,7 @@ ring:
563563
# CLI flag: -distributor.ring.multi.mirror-timeout
564564
[mirror_timeout: <duration> | default = 2s]
565565
566-
# Period at which to heartbeat to the ring.
566+
# Period at which to heartbeat to the ring. 0 = disabled.
567567
# CLI flag: -distributor.ring.heartbeat-period
568568
[heartbeat_period: <duration> | default = 5s]
569569
@@ -679,7 +679,7 @@ lifecycler:
679679
# CLI flag: -ingester.num-tokens
680680
[num_tokens: <int> | default = 128]
681681
682-
# Period at which to heartbeat to consul.
682+
# Period at which to heartbeat to consul. 0 = disabled.
683683
# CLI flag: -ingester.heartbeat-period
684684
[heartbeat_period: <duration> | default = 5s]
685685
@@ -1581,7 +1581,7 @@ ring:
15811581
# CLI flag: -ruler.ring.multi.mirror-timeout
15821582
[mirror_timeout: <duration> | default = 2s]
15831583

1584-
# Period at which to heartbeat to the ring.
1584+
# Period at which to heartbeat to the ring. 0 = disabled.
15851585
# CLI flag: -ruler.ring.heartbeat-period
15861586
[heartbeat_period: <duration> | default = 5s]
15871587

@@ -1907,7 +1907,7 @@ sharding_ring:
19071907
# CLI flag: -alertmanager.sharding-ring.multi.mirror-timeout
19081908
[mirror_timeout: <duration> | default = 2s]
19091909
1910-
# Period at which to heartbeat to the ring.
1910+
# Period at which to heartbeat to the ring. 0 = disabled.
19111911
# CLI flag: -alertmanager.sharding-ring.heartbeat-period
19121912
[heartbeat_period: <duration> | default = 15s]
19131913
@@ -5179,7 +5179,7 @@ sharding_ring:
51795179
# CLI flag: -compactor.ring.multi.mirror-timeout
51805180
[mirror_timeout: <duration> | default = 2s]
51815181
5182-
# Period at which to heartbeat to the ring.
5182+
# Period at which to heartbeat to the ring. 0 = disabled.
51835183
# CLI flag: -compactor.ring.heartbeat-period
51845184
[heartbeat_period: <duration> | default = 5s]
51855185
@@ -5257,7 +5257,7 @@ sharding_ring:
52575257
# CLI flag: -store-gateway.sharding-ring.multi.mirror-timeout
52585258
[mirror_timeout: <duration> | default = 2s]
52595259
5260-
# Period at which to heartbeat to the ring.
5260+
# Period at which to heartbeat to the ring. 0 = disabled.
52615261
# CLI flag: -store-gateway.sharding-ring.heartbeat-period
52625262
[heartbeat_period: <duration> | default = 15s]
52635263

docs/configuration/v1-guarantees.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,4 +87,11 @@ Currently experimental features are:
8787
- `-ruler.ring.heartbeat-timeout=0`
8888
- `-alertmanager.sharding-ring.heartbeat-timeout=0`
8989
- `-compactor.ring.heartbeat-timeout=0`
90-
- `-store-gateway.sharding-ring.heartbeat-timeout=0`
90+
- `-store-gateway.sharding-ring.heartbeat-timeout=0`
91+
- Disabling ring heartbeats
92+
- `-distributor.ring.heartbeat-period=0`
93+
- `-ingester.heartbeat-period=0`
94+
- `-ruler.ring.heartbeat-period=0`
95+
- `-alertmanager.sharding-ring.heartbeat-period=0`
96+
- `-compactor.ring.heartbeat-period=0`
97+
- `-store-gateway.sharding-ring.heartbeat-period=0`

pkg/alertmanager/alertmanager_ring.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) {
7676

7777
// Ring flags
7878
cfg.KVStore.RegisterFlagsWithPrefix(rfprefix, "alertmanagers/", f)
79-
f.DurationVar(&cfg.HeartbeatPeriod, rfprefix+"heartbeat-period", 15*time.Second, "Period at which to heartbeat to the ring.")
79+
f.DurationVar(&cfg.HeartbeatPeriod, rfprefix+"heartbeat-period", 15*time.Second, "Period at which to heartbeat to the ring. 0 = disabled.")
8080
f.DurationVar(&cfg.HeartbeatTimeout, rfprefix+"heartbeat-timeout", time.Minute, "The heartbeat timeout after which alertmanagers are considered unhealthy within the ring. 0 = never (timeout disabled).")
8181
f.IntVar(&cfg.ReplicationFactor, rfprefix+"replication-factor", 3, "The replication factor to use when sharding the alertmanager.")
8282
f.BoolVar(&cfg.ZoneAwarenessEnabled, rfprefix+"zone-awareness-enabled", false, "True to enable zone-awareness and replicate alerts across different availability zones.")

pkg/compactor/compactor_ring.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) {
5050

5151
// Ring flags
5252
cfg.KVStore.RegisterFlagsWithPrefix("compactor.ring.", "collectors/", f)
53-
f.DurationVar(&cfg.HeartbeatPeriod, "compactor.ring.heartbeat-period", 5*time.Second, "Period at which to heartbeat to the ring.")
53+
f.DurationVar(&cfg.HeartbeatPeriod, "compactor.ring.heartbeat-period", 5*time.Second, "Period at which to heartbeat to the ring. 0 = disabled.")
5454
f.DurationVar(&cfg.HeartbeatTimeout, "compactor.ring.heartbeat-timeout", time.Minute, "The heartbeat timeout after which compactors are considered unhealthy within the ring. 0 = never (timeout disabled).")
5555

5656
// Wait stability flags.

pkg/distributor/distributor_ring.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) {
4242

4343
// Ring flags
4444
cfg.KVStore.RegisterFlagsWithPrefix("distributor.ring.", "collectors/", f)
45-
f.DurationVar(&cfg.HeartbeatPeriod, "distributor.ring.heartbeat-period", 5*time.Second, "Period at which to heartbeat to the ring.")
45+
f.DurationVar(&cfg.HeartbeatPeriod, "distributor.ring.heartbeat-period", 5*time.Second, "Period at which to heartbeat to the ring. 0 = disabled.")
4646
f.DurationVar(&cfg.HeartbeatTimeout, "distributor.ring.heartbeat-timeout", time.Minute, "The heartbeat timeout after which distributors are considered unhealthy within the ring. 0 = never (timeout disabled).")
4747

4848
// Instance flags

pkg/ring/basic_lifecycler.go

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313
"github.com/prometheus/client_golang/prometheus"
1414

1515
"github.com/cortexproject/cortex/pkg/ring/kv"
16+
"github.com/cortexproject/cortex/pkg/util"
1617
util_log "github.com/cortexproject/cortex/pkg/util/log"
1718
"github.com/cortexproject/cortex/pkg/util/services"
1819
)
@@ -182,12 +183,12 @@ func (l *BasicLifecycler) starting(ctx context.Context) error {
182183
}
183184

184185
func (l *BasicLifecycler) running(ctx context.Context) error {
185-
heartbeatTicker := time.NewTicker(l.cfg.HeartbeatPeriod)
186-
defer heartbeatTicker.Stop()
186+
heartbeatTickerStop, heartbeatTickerChan := util.NewDisableableTicker(l.cfg.HeartbeatPeriod)
187+
defer heartbeatTickerStop()
187188

188189
for {
189190
select {
190-
case <-heartbeatTicker.C:
191+
case <-heartbeatTickerChan:
191192
l.heartbeat(ctx)
192193

193194
case f := <-l.actorChan:
@@ -214,13 +215,13 @@ func (l *BasicLifecycler) stopping(runningError error) error {
214215
}()
215216

216217
// Heartbeat while the stopping delegate function is running.
217-
heartbeatTicker := time.NewTicker(l.cfg.HeartbeatPeriod)
218-
defer heartbeatTicker.Stop()
218+
heartbeatTickerStop, heartbeatTickerChan := util.NewDisableableTicker(l.cfg.HeartbeatPeriod)
219+
defer heartbeatTickerStop()
219220

220221
heartbeatLoop:
221222
for {
222223
select {
223-
case <-heartbeatTicker.C:
224+
case <-heartbeatTickerChan:
224225
l.heartbeat(context.Background())
225226
case <-done:
226227
break heartbeatLoop
@@ -292,8 +293,8 @@ func (l *BasicLifecycler) registerInstance(ctx context.Context) error {
292293
}
293294

294295
func (l *BasicLifecycler) waitStableTokens(ctx context.Context, period time.Duration) error {
295-
heartbeatTicker := time.NewTicker(l.cfg.HeartbeatPeriod)
296-
defer heartbeatTicker.Stop()
296+
heartbeatTickerStop, heartbeatTickerChan := util.NewDisableableTicker(l.cfg.HeartbeatPeriod)
297+
defer heartbeatTickerStop()
297298

298299
// The first observation will occur after the specified period.
299300
level.Info(l.logger).Log("msg", "waiting stable tokens", "ring", l.ringName)
@@ -312,7 +313,7 @@ func (l *BasicLifecycler) waitStableTokens(ctx context.Context, period time.Dura
312313
level.Info(l.logger).Log("msg", "tokens verification succeeded", "ring", l.ringName)
313314
return nil
314315

315-
case <-heartbeatTicker.C:
316+
case <-heartbeatTickerChan:
316317
l.heartbeat(ctx)
317318

318319
case <-ctx.Done():

pkg/ring/lifecycler.go

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import (
1717
"go.uber.org/atomic"
1818

1919
"github.com/cortexproject/cortex/pkg/ring/kv"
20+
"github.com/cortexproject/cortex/pkg/util"
2021
"github.com/cortexproject/cortex/pkg/util/flagext"
2122
"github.com/cortexproject/cortex/pkg/util/log"
2223
"github.com/cortexproject/cortex/pkg/util/services"
@@ -83,7 +84,7 @@ func (cfg *LifecyclerConfig) RegisterFlagsWithPrefix(prefix string, f *flag.Flag
8384
}
8485

8586
f.IntVar(&cfg.NumTokens, prefix+"num-tokens", 128, "Number of tokens for each ingester.")
86-
f.DurationVar(&cfg.HeartbeatPeriod, prefix+"heartbeat-period", 5*time.Second, "Period at which to heartbeat to consul.")
87+
f.DurationVar(&cfg.HeartbeatPeriod, prefix+"heartbeat-period", 5*time.Second, "Period at which to heartbeat to consul. 0 = disabled.")
8788
f.DurationVar(&cfg.JoinAfter, prefix+"join-after", 0*time.Second, "Period to wait for a claim from another member; will join automatically after this.")
8889
f.DurationVar(&cfg.ObservePeriod, prefix+"observe-period", 0*time.Second, "Observe tokens after generating to resolve collisions. Useful when using gossiping ring.")
8990
f.DurationVar(&cfg.MinReadyDuration, prefix+"min-ready-duration", 1*time.Minute, "Minimum duration to wait before becoming ready. This is to work around race conditions with ingesters exiting and updating the ring.")
@@ -392,8 +393,8 @@ func (i *Lifecycler) loop(ctx context.Context) error {
392393
autoJoinAfter := time.After(i.cfg.JoinAfter)
393394
var observeChan <-chan time.Time = nil
394395

395-
heartbeatTicker := time.NewTicker(i.cfg.HeartbeatPeriod)
396-
defer heartbeatTicker.Stop()
396+
heartbeatTickerStop, heartbeatTickerChan := util.NewDisableableTicker(i.cfg.HeartbeatPeriod)
397+
defer heartbeatTickerStop()
397398

398399
for {
399400
select {
@@ -442,7 +443,7 @@ func (i *Lifecycler) loop(ctx context.Context) error {
442443
observeChan = time.After(i.cfg.ObservePeriod)
443444
}
444445

445-
case <-heartbeatTicker.C:
446+
case <-heartbeatTickerChan:
446447
consulHeartbeats.WithLabelValues(i.RingName).Inc()
447448
if err := i.updateConsul(context.Background()); err != nil {
448449
level.Error(log.Logger).Log("msg", "failed to write to the KV store, sleeping", "ring", i.RingName, "err", err)
@@ -469,8 +470,8 @@ func (i *Lifecycler) stopping(runningError error) error {
469470
return nil
470471
}
471472

472-
heartbeatTicker := time.NewTicker(i.cfg.HeartbeatPeriod)
473-
defer heartbeatTicker.Stop()
473+
heartbeatTickerStop, heartbeatTickerChan := util.NewDisableableTicker(i.cfg.HeartbeatPeriod)
474+
defer heartbeatTickerStop()
474475

475476
// Mark ourselved as Leaving so no more samples are send to us.
476477
err := i.changeState(context.Background(), LEAVING)
@@ -489,7 +490,7 @@ func (i *Lifecycler) stopping(runningError error) error {
489490
heartbeatLoop:
490491
for {
491492
select {
492-
case <-heartbeatTicker.C:
493+
case <-heartbeatTickerChan:
493494
consulHeartbeats.WithLabelValues(i.RingName).Inc()
494495
if err := i.updateConsul(context.Background()); err != nil {
495496
level.Error(log.Logger).Log("msg", "failed to write to the KV store, sleeping", "ring", i.RingName, "err", err)

pkg/ruler/ruler_ring.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) {
5656

5757
// Ring flags
5858
cfg.KVStore.RegisterFlagsWithPrefix("ruler.ring.", "rulers/", f)
59-
f.DurationVar(&cfg.HeartbeatPeriod, "ruler.ring.heartbeat-period", 5*time.Second, "Period at which to heartbeat to the ring.")
59+
f.DurationVar(&cfg.HeartbeatPeriod, "ruler.ring.heartbeat-period", 5*time.Second, "Period at which to heartbeat to the ring. 0 = disabled.")
6060
f.DurationVar(&cfg.HeartbeatTimeout, "ruler.ring.heartbeat-timeout", time.Minute, "The heartbeat timeout after which rulers are considered unhealthy within the ring. 0 = never (timeout disabled).")
6161

6262
// Instance flags

pkg/storegateway/gateway_ring.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) {
9494

9595
// Ring flags
9696
cfg.KVStore.RegisterFlagsWithPrefix(ringFlagsPrefix, "collectors/", f)
97-
f.DurationVar(&cfg.HeartbeatPeriod, ringFlagsPrefix+"heartbeat-period", 15*time.Second, "Period at which to heartbeat to the ring.")
97+
f.DurationVar(&cfg.HeartbeatPeriod, ringFlagsPrefix+"heartbeat-period", 15*time.Second, "Period at which to heartbeat to the ring. 0 = disabled.")
9898
f.DurationVar(&cfg.HeartbeatTimeout, ringFlagsPrefix+"heartbeat-timeout", time.Minute, "The heartbeat timeout after which store gateways are considered unhealthy within the ring. 0 = never (timeout disabled)."+sharedOptionWithQuerier)
9999
f.IntVar(&cfg.ReplicationFactor, ringFlagsPrefix+"replication-factor", 3, "The replication factor to use when sharding blocks."+sharedOptionWithQuerier)
100100
f.StringVar(&cfg.TokensFilePath, ringFlagsPrefix+"tokens-file-path", "", "File path where tokens are stored. If empty, tokens are not stored at shutdown and restored at startup.")

pkg/util/time.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,3 +73,14 @@ func DurationWithPositiveJitter(input time.Duration, variancePerc float64) time.
7373

7474
return input + time.Duration(jitter)
7575
}
76+
77+
// NewDisableableTicker essentially wraps NewTicker but allows the ticker to be disabled by passing
78+
// zero duration as the interval. Returns a function for stopping the ticker, and the ticker channel.
79+
func NewDisableableTicker(interval time.Duration) (func(), <-chan time.Time) {
80+
if interval == 0 {
81+
return func() {}, nil
82+
}
83+
84+
tick := time.NewTicker(interval)
85+
return func() { tick.Stop() }, tick.C
86+
}

pkg/util/time_test.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,3 +103,31 @@ func TestParseTime(t *testing.T) {
103103
assert.Equal(t, TimeToMillis(test.result), ts)
104104
}
105105
}
106+
107+
func TestNewDisableableTicker_Enabled(t *testing.T) {
108+
stop, ch := NewDisableableTicker(10 * time.Millisecond)
109+
defer stop()
110+
111+
time.Sleep(100 * time.Millisecond)
112+
113+
select {
114+
case <-ch:
115+
break
116+
default:
117+
t.Error("ticker should have ticked when enabled")
118+
}
119+
}
120+
121+
func TestNewDisableableTicker_Disabled(t *testing.T) {
122+
stop, ch := NewDisableableTicker(0)
123+
defer stop()
124+
125+
time.Sleep(100 * time.Millisecond)
126+
127+
select {
128+
case <-ch:
129+
t.Error("ticker should not have ticked when disabled")
130+
default:
131+
break
132+
}
133+
}

0 commit comments

Comments
 (0)