Skip to content

Commit ba5a005

Browse files
friedrichg authored and alexqyle committed
Support enabled_tenants and disabled_tenants in alertmanager (cortexproject#5116)
* Support enabled_tenants and disabled_tenants in alertmanager
  Signed-off-by: Friedrich Gonzalez <[email protected]>
* Add PR number
  Signed-off-by: Friedrich Gonzalez <[email protected]>
* Disable alertmanager UI and API for tenants disabled
  Signed-off-by: Friedrich Gonzalez <[email protected]>
---------
Signed-off-by: Friedrich Gonzalez <[email protected]>
Signed-off-by: Alex Le <[email protected]>
1 parent 1bfd076 commit ba5a005

File tree

6 files changed

+110
-3
lines changed

6 files changed

+110
-3
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
* [ENHANCEMENT] Ingester: The metadata APIs should honour `querier.query-ingesters-within` when `querier.query-store-for-labels-enabled` is true. #5027
1414
* [ENHANCEMENT] Query Frontend: Skip instant query roundtripper if sharding is not applicable. #5062
1515
* [ENHANCEMENT] Push reduce one hash operation of Labels. #4945 #5114
16+
* [ENHANCEMENT] Alertmanager: Added `-alertmanager.enabled-tenants` and `-alertmanager.disabled-tenants` to explicitly enable or disable alertmanager for specific tenants. #5116
1617
* [FEATURE] Querier/Query Frontend: support Prometheus /api/v1/status/buildinfo API. #4978
1718
* [FEATURE] Ingester: Add active series to all_user_stats page. #4972
1819
* [FEATURE] Ingester: Added `-blocks-storage.tsdb.head-chunks-write-queue-size` allowing to configure the size of the in-memory queue used before flushing chunks to the disk. #5000

docs/configuration/config-file-reference.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1886,6 +1886,18 @@ alertmanager_client:
18861886
# result in potentially fewer lost silences, and fewer duplicate notifications.
18871887
# CLI flag: -alertmanager.persist-interval
18881888
[persist_interval: <duration> | default = 15m]
1889+
1890+
# Comma separated list of tenants whose alerts this alertmanager can process. If
1891+
# specified, only these tenants will be handled by alertmanager, otherwise this
1892+
# alertmanager can process alerts from all tenants.
1893+
# CLI flag: -alertmanager.enabled-tenants
1894+
[enabled_tenants: <string> | default = ""]
1895+
1896+
# Comma separated list of tenants whose alerts this alertmanager cannot process.
1897+
# If specified, an alertmanager that would normally pick the specified tenant(s)
1898+
# for processing will ignore them instead.
1899+
# CLI flag: -alertmanager.disabled-tenants
1900+
[disabled_tenants: <string> | default = ""]
18891901
```
18901902

18911903
### `alertmanager_storage_config`

pkg/alertmanager/distributor.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ func (d *Distributor) isUnaryReadPath(p string) bool {
118118
// In case of reads, it proxies the request to one of the alertmanagers.
119119
// DistributeRequest assumes that the caller has verified IsPathSupported returns
120120
// true for the route.
121-
func (d *Distributor) DistributeRequest(w http.ResponseWriter, r *http.Request) {
121+
func (d *Distributor) DistributeRequest(w http.ResponseWriter, r *http.Request, allowedTenants *util.AllowedTenants) {
122122
d.requestsInFlight.Add(1)
123123
defer d.requestsInFlight.Done()
124124

@@ -128,6 +128,11 @@ func (d *Distributor) DistributeRequest(w http.ResponseWriter, r *http.Request)
128128
return
129129
}
130130

131+
if !allowedTenants.IsAllowed(userID) {
132+
http.Error(w, "Tenant is not allowed", http.StatusUnauthorized)
133+
return
134+
}
135+
131136
logger := util_log.WithContext(r.Context(), d.logger)
132137

133138
if r.Method == http.MethodPost {

pkg/alertmanager/distributor_test.go

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ import (
2727
"github.com/cortexproject/cortex/pkg/ring"
2828
"github.com/cortexproject/cortex/pkg/ring/kv"
2929
"github.com/cortexproject/cortex/pkg/ring/kv/consul"
30+
"github.com/cortexproject/cortex/pkg/util"
3031
"github.com/cortexproject/cortex/pkg/util/flagext"
3132
util_log "github.com/cortexproject/cortex/pkg/util/log"
3233
"github.com/cortexproject/cortex/pkg/util/services"
@@ -40,6 +41,7 @@ func TestDistributor_DistributeRequest(t *testing.T) {
4041
replicationFactor int
4142
isRead bool
4243
isDelete bool
44+
isTenantDisabled bool
4345
expStatusCode int
4446
expectedTotalCalls int
4547
headersNotPreserved bool
@@ -56,6 +58,16 @@ func TestDistributor_DistributeRequest(t *testing.T) {
5658
expStatusCode: http.StatusOK,
5759
expectedTotalCalls: 3,
5860
route: "/alerts",
61+
}, {
62+
name: "Write /alerts, Simple AM request, all AM healthy, not allowed",
63+
numAM: 4,
64+
numHappyAM: 4,
65+
replicationFactor: 3,
66+
expStatusCode: http.StatusUnauthorized,
67+
expectedTotalCalls: 0,
68+
route: "/alerts",
69+
headersNotPreserved: true,
70+
isTenantDisabled: true,
5971
}, {
6072
name: "Write /alerts, Less than quorum AM available",
6173
numAM: 1,
@@ -262,9 +274,13 @@ func TestDistributor_DistributeRequest(t *testing.T) {
262274
req.Method = http.MethodDelete
263275
}
264276
req.RequestURI = url
277+
var allowedTenants *util.AllowedTenants
278+
if c.isTenantDisabled {
279+
allowedTenants = util.NewAllowedTenants(nil, []string{"1"})
280+
}
265281

266282
w := httptest.NewRecorder()
267-
d.DistributeRequest(w, req)
283+
d.DistributeRequest(w, req, allowedTenants)
268284
resp := w.Result()
269285
require.Equal(t, c.expStatusCode, resp.StatusCode)
270286

pkg/alertmanager/multitenant.go

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,9 @@ type MultitenantAlertmanagerConfig struct {
8585

8686
// For the state persister.
8787
Persister PersisterConfig `yaml:",inline"`
88+
89+
EnabledTenants flagext.StringSliceCSV `yaml:"enabled_tenants"`
90+
DisabledTenants flagext.StringSliceCSV `yaml:"disabled_tenants"`
8891
}
8992

9093
type ClusterConfig struct {
@@ -116,6 +119,8 @@ func (cfg *MultitenantAlertmanagerConfig) RegisterFlags(f *flag.FlagSet) {
116119
f.BoolVar(&cfg.EnableAPI, "experimental.alertmanager.enable-api", false, "Enable the experimental alertmanager config api.")
117120

118121
f.BoolVar(&cfg.ShardingEnabled, "alertmanager.sharding-enabled", false, "Shard tenants across multiple alertmanager instances.")
122+
f.Var(&cfg.EnabledTenants, "alertmanager.enabled-tenants", "Comma separated list of tenants whose alerts this alertmanager can process. If specified, only these tenants will be handled by alertmanager, otherwise this alertmanager can process alerts from all tenants.")
123+
f.Var(&cfg.DisabledTenants, "alertmanager.disabled-tenants", "Comma separated list of tenants whose alerts this alertmanager cannot process. If specified, a alertmanager that would normally pick the specified tenant(s) for processing will ignore them instead.")
119124

120125
cfg.AlertmanagerClient.RegisterFlagsWithPrefix("alertmanager.alertmanager-client", f)
121126
cfg.Persister.RegisterFlagsWithPrefix("alertmanager", f)
@@ -269,6 +274,8 @@ type MultitenantAlertmanager struct {
269274

270275
limits Limits
271276

277+
allowedTenants *util.AllowedTenants
278+
272279
registry prometheus.Registerer
273280
ringCheckErrors prometheus.Counter
274281
tenantsOwned prometheus.Gauge
@@ -359,6 +366,7 @@ func createMultitenantAlertmanager(cfg *MultitenantAlertmanagerConfig, fallbackC
359366
logger: log.With(logger, "component", "MultiTenantAlertmanager"),
360367
registry: registerer,
361368
limits: limits,
369+
allowedTenants: util.NewAllowedTenants(cfg.EnabledTenants, cfg.DisabledTenants),
362370
ringCheckErrors: promauto.With(registerer).NewCounter(prometheus.CounterOpts{
363371
Name: "cortex_alertmanager_ring_check_errors_total",
364372
Help: "Number of errors that have occurred when checking the ring for ownership.",
@@ -418,6 +426,13 @@ func createMultitenantAlertmanager(cfg *MultitenantAlertmanagerConfig, fallbackC
418426
}
419427
}
420428

429+
if len(cfg.EnabledTenants) > 0 {
430+
level.Info(am.logger).Log("msg", "alertmanager using enabled users", "enabled", strings.Join(cfg.EnabledTenants, ", "))
431+
}
432+
if len(cfg.DisabledTenants) > 0 {
433+
level.Info(am.logger).Log("msg", "alertmanager using disabled users", "disabled", strings.Join(cfg.DisabledTenants, ", "))
434+
}
435+
421436
if registerer != nil {
422437
registerer.MustRegister(am.alertmanagerMetrics)
423438
}
@@ -735,6 +750,10 @@ func (am *MultitenantAlertmanager) loadAlertmanagerConfigs(ctx context.Context)
735750

736751
// Filter out users not owned by this shard.
737752
for _, userID := range allUserIDs {
753+
if !am.allowedTenants.IsAllowed(userID) {
754+
level.Debug(am.logger).Log("msg", "ignoring alertmanager for user, not allowed", "user", userID)
755+
continue
756+
}
738757
if am.isUserOwned(userID) {
739758
ownedUserIDs = append(ownedUserIDs, userID)
740759
}
@@ -993,7 +1012,7 @@ func (am *MultitenantAlertmanager) ServeHTTP(w http.ResponseWriter, req *http.Re
9931012
}
9941013

9951014
if am.cfg.ShardingEnabled && am.distributor.IsPathSupported(req.URL.Path) {
996-
am.distributor.DistributeRequest(w, req)
1015+
am.distributor.DistributeRequest(w, req, am.allowedTenants)
9971016
return
9981017
}
9991018

@@ -1014,6 +1033,10 @@ func (am *MultitenantAlertmanager) serveRequest(w http.ResponseWriter, req *http
10141033
http.Error(w, err.Error(), http.StatusUnauthorized)
10151034
return
10161035
}
1036+
if !am.allowedTenants.IsAllowed(userID) {
1037+
http.Error(w, "Tenant is not allowed", http.StatusUnauthorized)
1038+
return
1039+
}
10171040
am.alertmanagersMtx.Lock()
10181041
userAM, ok := am.alertmanagers[userID]
10191042
am.alertmanagersMtx.Unlock()
@@ -1197,6 +1220,10 @@ func (am *MultitenantAlertmanager) deleteUnusedRemoteUserState(ctx context.Conte
11971220
}
11981221

11991222
for _, userID := range usersWithState {
1223+
if !am.allowedTenants.IsAllowed(userID) {
1224+
level.Debug(am.logger).Log("msg", "not deleting remote state for user, not allowed", "user", userID)
1225+
continue
1226+
}
12001227
if _, ok := users[userID]; ok {
12011228
continue
12021229
}

pkg/alertmanager/multitenant_test.go

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1110,13 +1110,29 @@ func TestMultitenantAlertmanager_PerTenantSharding(t *testing.T) {
11101110
configs int
11111111
expectedTenants int
11121112
withSharding bool
1113+
enabledTenants []string
1114+
disabledTenants []string
11131115
}{
11141116
{
11151117
name: "sharding disabled, 1 instance",
11161118
instances: 1,
11171119
configs: 10,
11181120
expectedTenants: 10,
11191121
},
1122+
{
1123+
name: "sharding disabled, 1 instance, single user allowed",
1124+
instances: 1,
1125+
configs: 10,
1126+
expectedTenants: 1,
1127+
enabledTenants: []string{"u-1"},
1128+
},
1129+
{
1130+
name: "sharding disabled, 1 instance, single user disabled",
1131+
instances: 1,
1132+
configs: 10,
1133+
expectedTenants: 9,
1134+
disabledTenants: []string{"u-2"},
1135+
},
11201136
{
11211137
name: "sharding disabled, 2 instances",
11221138
instances: 2,
@@ -1131,6 +1147,24 @@ func TestMultitenantAlertmanager_PerTenantSharding(t *testing.T) {
11311147
configs: 10,
11321148
expectedTenants: 10, // same as no sharding and 1 instance
11331149
},
1150+
{
1151+
name: "sharding enabled, 1 instance, enabled tenants, single user allowed",
1152+
withSharding: true,
1153+
instances: 1,
1154+
replicationFactor: 1,
1155+
configs: 10,
1156+
expectedTenants: 1,
1157+
enabledTenants: []string{"u-3"},
1158+
},
1159+
{
1160+
name: "sharding enabled, 1 instance, enabled tenants, single user disabled",
1161+
withSharding: true,
1162+
instances: 1,
1163+
replicationFactor: 1,
1164+
configs: 10,
1165+
expectedTenants: 9,
1166+
disabledTenants: []string{"u-4"},
1167+
},
11341168
{
11351169
name: "sharding enabled, 2 instances, RF = 1",
11361170
withSharding: true,
@@ -1155,6 +1189,15 @@ func TestMultitenantAlertmanager_PerTenantSharding(t *testing.T) {
11551189
configs: 10,
11561190
expectedTenants: 30, // configs * replication factor
11571191
},
1192+
{
1193+
name: "sharding enabled, 5 instances, RF = 3, two users disabled",
1194+
withSharding: true,
1195+
instances: 5,
1196+
replicationFactor: 3,
1197+
configs: 10,
1198+
expectedTenants: 24, // (configs - disabled-tenants) * replication factor
1199+
disabledTenants: []string{"u-1", "u-2"},
1200+
},
11581201
}
11591202

11601203
for _, tt := range tc {
@@ -1192,6 +1235,9 @@ func TestMultitenantAlertmanager_PerTenantSharding(t *testing.T) {
11921235
amConfig.PollInterval = time.Hour
11931236
amConfig.ShardingRing.RingCheckPeriod = time.Hour
11941237

1238+
amConfig.EnabledTenants = tt.enabledTenants
1239+
amConfig.DisabledTenants = tt.disabledTenants
1240+
11951241
if tt.withSharding {
11961242
amConfig.ShardingEnabled = true
11971243
}

0 commit comments

Comments
 (0)