Fixed flaky TestMultitenantAlertmanager_SyncOnRingTopologyChanges #4418

Merged: 1 commit, Aug 12, 2021
pkg/alertmanager/multitenant.go (14 changes: 10 additions & 4 deletions)

@@ -250,6 +250,10 @@ type MultitenantAlertmanager struct {
 	distributor *Distributor
 	grpcServer  *server.Server
 
+	// Last ring state. This variable is not protected with a mutex because it's always
+	// accessed by a single goroutine at a time.
+	ringLastState ring.ReplicationSet
+
 	// Subservices manager (ring, lifecycler)
 	subservices        *services.Manager
 	subservicesWatcher *services.FailureWatcher
@@ -488,6 +492,10 @@ func (am *MultitenantAlertmanager) starting(ctx context.Context) (err error) {
 	}
 
 	if am.cfg.ShardingEnabled {
+		// Store the ring state after the initial Alertmanager configs sync has been done and before we do change
+		// our state in the ring.
+		am.ringLastState, _ = am.ring.GetAllHealthy(RingOp)
+
 		// Make sure that all the alertmanagers we were initially configured with have
 		// fetched state from the replicas, before advertising as ACTIVE. This will
 		// reduce the possibility that we lose state when new instances join/leave.
@@ -631,10 +639,8 @@ func (am *MultitenantAlertmanager) run(ctx context.Context) error {
 	defer tick.Stop()
 
 	var ringTickerChan <-chan time.Time
-	var ringLastState ring.ReplicationSet
 
 	if am.cfg.ShardingEnabled {
-		ringLastState, _ = am.ring.GetAllHealthy(RingOp)
 		ringTicker := time.NewTicker(util.DurationWithJitter(am.cfg.ShardingRing.RingCheckPeriod, 0.2))
 		defer ringTicker.Stop()
 		ringTickerChan = ringTicker.C
@@ -656,8 +662,8 @@ func (am *MultitenantAlertmanager) run(ctx context.Context) error {
 			// replication set which we use to compare with the previous state.
 			currRingState, _ := am.ring.GetAllHealthy(RingOp)
 
-			if ring.HasReplicationSetChanged(ringLastState, currRingState) {
-				ringLastState = currRingState
+			if ring.HasReplicationSetChanged(am.ringLastState, currRingState) {
+				am.ringLastState = currRingState
 				if err := am.loadAndSyncConfigs(ctx, reasonRingChange); err != nil {
 					level.Warn(am.logger).Log("msg", "error while synchronizing alertmanager configs", "err", err)
 				}
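For readers following the change: ringLastState moves onto the struct and is captured in starting(), before the instance changes its own state in the ring, so run() no longer takes its first snapshot after the topology may have already changed. The detection itself boils down to comparing two ring.ReplicationSet snapshots with ring.HasReplicationSetChanged. Below is a minimal, hand-wired sketch of that comparison; it assumes the Instances/InstanceDesc field names exported by the Cortex ring package at the time, and it builds the sets by hand instead of calling am.ring.GetAllHealthy(RingOp) against a live ring.

package main

import (
	"fmt"

	"github.com/cortexproject/cortex/pkg/ring"
)

func main() {
	// Hand-built replication sets standing in for two successive
	// am.ring.GetAllHealthy(RingOp) snapshots.
	before := ring.ReplicationSet{
		Instances: []ring.InstanceDesc{
			{Addr: "10.0.0.1:9094"},
			{Addr: "10.0.0.2:9094"},
		},
	}
	after := ring.ReplicationSet{
		Instances: []ring.InstanceDesc{
			{Addr: "10.0.0.1:9094"},
			{Addr: "10.0.0.3:9094"}, // one instance replaced
		},
	}

	// run() triggers loadAndSyncConfigs(reasonRingChange) only when this
	// reports a difference between the stored and the current ring state.
	if ring.HasReplicationSetChanged(before, after) {
		fmt.Println("ring topology changed: resync alertmanager configs")
	}
}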
pkg/alertmanager/multitenant_test.go (2 changes: 1 addition & 1 deletion)

@@ -1330,7 +1330,7 @@ func TestMultitenantAlertmanager_SyncOnRingTopologyChanges(t *testing.T) {
 			if tt.expected {
 				expectedSyncs++
 			}
-			test.Poll(t, 5*time.Second, float64(expectedSyncs), func() interface{} {
+			test.Poll(t, 3*time.Second, float64(expectedSyncs), func() interface{} {
 				metrics := regs.BuildMetricFamiliesPerUser()
 				return metrics.GetSumOfCounters("cortex_alertmanager_sync_configs_total")
 			})
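The test waits on the Poll helper from Cortex's test utilities, which keeps re-evaluating a closure until it returns the expected value or the timeout fails the test. A rough sketch of that pattern, with a plain atomic counter standing in for the cortex_alertmanager_sync_configs_total metric, and assuming the helper lives at github.com/cortexproject/cortex/pkg/util/test with the Poll(t, timeout, want, have) shape seen above:

package alertmanager_test

import (
	"sync/atomic"
	"testing"
	"time"

	"github.com/cortexproject/cortex/pkg/util/test"
)

func TestPollSketch(t *testing.T) {
	// Atomic counter standing in for the sync-configs-total metric.
	var syncs int64

	// Simulate a config sync completing some time after a ring change.
	go func() {
		time.Sleep(100 * time.Millisecond)
		atomic.AddInt64(&syncs, 1)
	}()

	// Poll re-runs the closure until it returns float64(1), matching how the
	// real test compares counter sums, or fails the test after 3 seconds.
	test.Poll(t, 3*time.Second, float64(1), func() interface{} {
		return float64(atomic.LoadInt64(&syncs))
	})
}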