Skip to content

Commit 019a715

Browse files
authored
feat: add per-monitor configuration to selectively disable monitors (#50)
* feat: merge e2e-ci into e2e * add update-e2e-manifests to generate target * feat: support monitor specific configurations * update readme * address comments + make generate * improvements to logging and handle cases when configmap is deleted * run make generate
1 parent ebfcaa5 commit 019a715

9 files changed

Lines changed: 450 additions & 28 deletions

File tree

README.md

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,47 @@ It is recommended to install the EKS Node Health Monitoring Agent as an EKS add-
3636

3737
For detailed configuration options and usage documentation, refer to the [Amazon EKS Node Health documentation](https://docs.aws.amazon.com/eks/latest/userguide/node-health.html).
3838

39+
## Configuring Monitors
40+
41+
By default, all monitors are enabled. Individual monitors can be disabled via the Helm chart's `nodeAgent.monitors` configuration or by providing a config file at `/etc/nma/config.yaml`.
42+
43+
### Helm Values
44+
45+
```yaml
46+
nodeAgent:
47+
monitors:
48+
networking:
49+
enabled: false
50+
neuron:
51+
enabled: false
52+
```
53+
54+
### Config File Format
55+
56+
The agent reads a YAML config file mounted at `/etc/nma/config.yaml`. Omitted monitors default to enabled.
57+
58+
```yaml
59+
monitors:
60+
kernel-monitor:
61+
enabled: true
62+
networking:
63+
enabled: false
64+
storage-monitor:
65+
enabled: true
66+
nvidia:
67+
enabled: true
68+
neuron:
69+
enabled: false
70+
runtime:
71+
enabled: true
72+
```
73+
74+
Valid plugin names: `kernel-monitor`, `networking`, `storage-monitor`, `nvidia`, `neuron`, `runtime`.
75+
76+
When a monitor is disabled:
77+
- Its health checks are not executed.
78+
- The corresponding `NodeCondition` (e.g., `NetworkingReady`) is not set on the node, avoiding false-positive healthy status for unmonitored subsystems.
79+
3980
## Building
4081

4182
```bash

charts/configuration.schema.json

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,31 @@
4747
},
4848
"tolerations": {
4949
"$ref": "#/definitions/Tolerations"
50+
},
51+
"monitors": {
52+
"type": "object",
53+
"description": "Per-monitor configuration keyed by plugin name",
54+
"additionalProperties": false,
55+
"properties": {
56+
"kernel-monitor": {
57+
"$ref": "#/definitions/MonitorSettings"
58+
},
59+
"networking": {
60+
"$ref": "#/definitions/MonitorSettings"
61+
},
62+
"storage-monitor": {
63+
"$ref": "#/definitions/MonitorSettings"
64+
},
65+
"nvidia": {
66+
"$ref": "#/definitions/MonitorSettings"
67+
},
68+
"neuron": {
69+
"$ref": "#/definitions/MonitorSettings"
70+
},
71+
"runtime": {
72+
"$ref": "#/definitions/MonitorSettings"
73+
}
74+
}
5075
}
5176
}
5277
},
@@ -123,6 +148,19 @@
123148
"type": "object"
124149
}
125150
},
151+
"MonitorSettings": {
152+
"title": "MonitorSettings",
153+
"type": "object",
154+
"description": "Per-monitor settings",
155+
"additionalProperties": false,
156+
"properties": {
157+
"enabled": {
158+
"type": "boolean",
159+
"description": "Whether this monitor is enabled",
160+
"default": true
161+
}
162+
}
163+
},
126164
"StringMap": {
127165
"title": "StringMap",
128166
"type": "object",

charts/eks-node-monitoring-agent/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ The following table lists the configurable parameters for this chart and their d
6161
| nodeAgent.image.pullPolicy | string | `"IfNotPresent"` | Container pull policy for the eks-node-monitoring-agent |
6262
| nodeAgent.image.region | string | `"us-west-2"` | ECR repository region for the eks-node-monitoring-agent |
6363
| nodeAgent.image.tag | string | `"v1.5.2-eksbuild.1"` | Image tag for the eks-node-monitoring-agent |
64+
| nodeAgent.monitors | object | `{}` | Per-monitor configuration keyed by plugin name. See the main README for details. |
6465
| nodeAgent.podAnnotations | object | `{}` | Pod annotations applied to the eks-node-monitoring-agent |
6566
| nodeAgent.podLabels | object | `{}` | Pod labels applied to the eks-node-monitoring-agent |
6667
| nodeAgent.resources | object | `{"limits":{"cpu":"250m","memory":"200Mi"},"requests":{"cpu":"10m","memory":"30Mi"}}` | Container resources for the eks-node-monitoring-agent |

charts/eks-node-monitoring-agent/templates/daemonset.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,18 @@ spec:
7474
volumeMounts:
7575
- name: host-root
7676
mountPath: /host
77+
{{- if .Values.nodeAgent.monitors }}
78+
- name: monitor-config
79+
mountPath: /etc/nma
80+
readOnly: true
81+
{{- end }}
7782
volumes:
7883
- name: host-root
7984
hostPath:
8085
path: /
86+
{{- if .Values.nodeAgent.monitors }}
87+
- name: monitor-config
88+
configMap:
89+
name: {{ include "eks-node-monitoring-agent.fullname" . }}-monitor-config
90+
optional: true
91+
{{- end }}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
{{- if .Values.nodeAgent.monitors }}
2+
apiVersion: v1
3+
kind: ConfigMap
4+
metadata:
5+
name: {{ include "eks-node-monitoring-agent.fullname" . }}-monitor-config
6+
namespace: {{ .Release.Namespace }}
7+
labels:
8+
{{- include "eks-node-monitoring-agent.labels" . | nindent 4 }}
9+
data:
10+
config.yaml: |
11+
monitors:
12+
{{- toYaml .Values.nodeAgent.monitors | nindent 6 }}
13+
{{- end }}

charts/eks-node-monitoring-agent/values.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,8 @@ nodeAgent:
8484
- operator: Exists
8585
# -- Pod labels applied to the eks-node-monitoring-agent
8686
podLabels: {}
87+
# -- Per-monitor configuration keyed by plugin name. See the main README for details.
88+
monitors: {}
8789
# -- Pod annotations applied to the eks-node-monitoring-agent
8890
podAnnotations: {}
8991

cmd/eks-node-monitoring-agent/main.go

Lines changed: 71 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
"sigs.k8s.io/controller-runtime/pkg/log/zap"
2626
"sigs.k8s.io/controller-runtime/pkg/metrics/server"
2727

28+
"github.com/aws/eks-node-monitoring-agent/api/monitor"
2829
"github.com/aws/eks-node-monitoring-agent/api/v1alpha1"
2930
"github.com/aws/eks-node-monitoring-agent/pkg/conditions"
3031
"github.com/aws/eks-node-monitoring-agent/pkg/config"
@@ -39,6 +40,7 @@ import (
3940
_ "github.com/aws/eks-node-monitoring-agent/monitors/neuron"
4041
_ "github.com/aws/eks-node-monitoring-agent/monitors/nvidia"
4142
_ "github.com/aws/eks-node-monitoring-agent/monitors/storage"
43+
4244
// Import monitors that require explicit registration (can't use init())
4345
"github.com/aws/eks-node-monitoring-agent/monitors/runtime"
4446
// Import observer packages to register observers
@@ -185,47 +187,88 @@ func run() error {
185187
return err
186188
}
187189

188-
// Get all registered monitors from the global plugin registry
189-
allMonitors := registry.GlobalRegistry().AllMonitors()
190-
if len(allMonitors) == 0 {
191-
logger.Info("no monitors registered - agent will run without monitoring capabilities")
192-
return fmt.Errorf("no monitors registered")
190+
// Load monitor configuration from ConfigMap mount
191+
monitorConfig, configFound, err := config.LoadMonitorConfig(config.DefaultConfigPath)
192+
if err != nil {
193+
logger.Error(err, "failed to load monitor configuration")
194+
return err
195+
}
196+
if !configFound {
197+
logger.Info("monitor config file not found, all monitors will be enabled by default", "path", config.DefaultConfigPath)
193198
}
194199

195-
logger.Info("registered monitors", "count", len(allMonitors))
196-
for _, mon := range allMonitors {
197-
logger.Info("monitor available", "name", mon.Name())
200+
// Filter plugins by configuration and log effective state
201+
allPlugins := registry.GlobalRegistry().List()
202+
var enabledMonitors []monitor.Monitor
203+
var disabledNames []string
204+
205+
for _, plugin := range allPlugins {
206+
enabled := monitorConfig.IsMonitorEnabled(plugin.Name())
207+
logger.Info("monitor configuration", "plugin", plugin.Name(), "enabled", enabled)
208+
if !enabled {
209+
disabledNames = append(disabledNames, plugin.Name())
210+
continue
211+
}
212+
enabledMonitors = append(enabledMonitors, plugin.Monitors()...)
198213
}
199214

200-
// Build condition configs for node exporter
215+
if len(disabledNames) > 0 {
216+
logger.Info("monitors disabled by configuration", "plugins", disabledNames)
217+
}
218+
219+
if len(enabledMonitors) == 0 {
220+
logger.Info("all monitors are disabled by configuration, NMA will not perform any monitoring")
221+
} else {
222+
logger.Info("enabled monitors", "count", len(enabledMonitors))
223+
for _, mon := range enabledMonitors {
224+
logger.Info("monitor available", "name", mon.Name())
225+
}
226+
}
227+
228+
// Build condition configs for node exporter, only for enabled monitors.
229+
// NodeExporter unconditionally sets all provided conditions to ConditionTrue,
230+
// so we must exclude conditions for disabled monitors to avoid falsely
231+
// reporting health for subsystems that are not being monitored.
201232
conditionConfigs := make(map[corev1.NodeConditionType]manager.NodeConditionConfig)
202-
conditionConfigs[conditions.KernelReady] = manager.NodeConditionConfig{
203-
ReadyReason: "KernelIsReady",
204-
ReadyMessage: "Monitoring for the Kernel system is active",
233+
if monitorConfig.IsMonitorEnabled("kernel-monitor") {
234+
conditionConfigs[conditions.KernelReady] = manager.NodeConditionConfig{
235+
ReadyReason: "KernelIsReady",
236+
ReadyMessage: "Monitoring for the Kernel system is active",
237+
}
205238
}
206-
conditionConfigs[conditions.StorageReady] = manager.NodeConditionConfig{
207-
ReadyReason: "DiskIsReady",
208-
ReadyMessage: "Monitoring for the Disk system is active",
239+
if monitorConfig.IsMonitorEnabled("storage-monitor") {
240+
conditionConfigs[conditions.StorageReady] = manager.NodeConditionConfig{
241+
ReadyReason: "DiskIsReady",
242+
ReadyMessage: "Monitoring for the Disk system is active",
243+
}
209244
}
210-
conditionConfigs[conditions.ContainerRuntimeReady] = manager.NodeConditionConfig{
211-
ReadyReason: "ContainerRuntimeIsReady",
212-
ReadyMessage: "Monitoring for the ContainerRuntime system is active",
245+
if monitorConfig.IsMonitorEnabled("runtime") {
246+
conditionConfigs[conditions.ContainerRuntimeReady] = manager.NodeConditionConfig{
247+
ReadyReason: "ContainerRuntimeIsReady",
248+
ReadyMessage: "Monitoring for the ContainerRuntime system is active",
249+
}
213250
}
214-
conditionConfigs[conditions.NetworkingReady] = manager.NodeConditionConfig{
215-
ReadyReason: "NetworkingIsReady",
216-
ReadyMessage: "Monitoring for the Networking system is active",
251+
if monitorConfig.IsMonitorEnabled("networking") {
252+
conditionConfigs[conditions.NetworkingReady] = manager.NodeConditionConfig{
253+
ReadyReason: "NetworkingIsReady",
254+
ReadyMessage: "Monitoring for the Networking system is active",
255+
}
217256
}
218257

219258
switch runtimeContext.AcceleratedHardware() {
220259
case config.AcceleratedHardwareNvidia:
221-
conditionConfigs[conditions.AcceleratedHardwareReady] = manager.NodeConditionConfig{
222-
ReadyReason: "NvidiaGPUIsReady",
223-
ReadyMessage: "Monitoring for the Nvidia GPU system is active",
260+
if monitorConfig.IsMonitorEnabled("nvidia") {
261+
conditionConfigs[conditions.AcceleratedHardwareReady] = manager.NodeConditionConfig{
262+
ReadyReason: "NvidiaGPUIsReady",
263+
ReadyMessage: "Monitoring for the Nvidia GPU system is active",
264+
}
224265
}
225266
case config.AcceleratedHardwareNeuron:
226-
conditionConfigs[conditions.AcceleratedHardwareReady] = manager.NodeConditionConfig{
227-
ReadyReason: "NeuronAcceleratedHardwareIsReady",
228-
ReadyMessage: "Monitoring for the Neuron AcceleratedHardware system is active",
267+
if monitorConfig.IsMonitorEnabled("neuron") {
268+
conditionConfigs[conditions.AcceleratedHardwareReady] = manager.NodeConditionConfig{
269+
ReadyReason: "NeuronAcceleratedHardwareIsReady",
270+
ReadyMessage: "Monitoring for the Neuron AcceleratedHardware system is active",
271+
}
229272
}
230273
}
231274

@@ -244,7 +287,7 @@ func run() error {
244287
monitorMgr := manager.NewMonitorManager(hostname, nodeExporter)
245288

246289
// Register all monitors with the manager
247-
for _, mon := range allMonitors {
290+
for _, mon := range enabledMonitors {
248291
monCtx := log.IntoContext(ctx, logger.WithValues("monitor", mon.Name()))
249292
var conditionType corev1.NodeConditionType
250293
switch mon.Name() {

pkg/config/monitor.go

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
package config
2+
3+
import (
4+
"errors"
5+
"fmt"
6+
"os"
7+
"slices"
8+
"sort"
9+
"strings"
10+
11+
"sigs.k8s.io/yaml"
12+
)
13+
14+
// DefaultConfigPath is the path of the YAML monitor configuration file that
// the agent hands to LoadMonitorConfig.
const DefaultConfigPath = "/etc/nma/config.yaml"
15+
16+
// MonitorSettings holds per-monitor configuration.
// Its zero value (Enabled unset) describes an enabled monitor; see IsEnabled.
17+
type MonitorSettings struct {
18+
// Enabled is a pointer so that an omitted value (nil) can be distinguished
// from an explicit false.
Enabled *bool `yaml:"enabled,omitempty" json:"enabled,omitempty"`
19+
}
20+
21+
// IsEnabled returns true if the monitor is enabled.
22+
func (ms MonitorSettings) IsEnabled() bool {
23+
// Defaults to true when Enabled is nil (not explicitly set).
24+
// this is to ensure backward compatibility and consistency
25+
if ms.Enabled == nil {
26+
return true
27+
}
28+
return *ms.Enabled
29+
}
30+
31+
// MonitorConfig is the top-level configuration structure.
32+
type MonitorConfig struct {
33+
// Monitors maps a plugin name to its settings. Plugins absent from the map
// are reported as enabled by IsMonitorEnabled.
Monitors map[string]MonitorSettings `yaml:"monitors,omitempty" json:"monitors,omitempty"`
34+
}
35+
36+
// IsMonitorEnabled checks if a given plugin is enabled.
37+
// Returns true if the config is nil, the map is nil, or the plugin is not present in the map.
38+
func (mc *MonitorConfig) IsMonitorEnabled(pluginName string) bool {
39+
if mc == nil || mc.Monitors == nil {
40+
return true
41+
}
42+
settings, exists := mc.Monitors[pluginName]
43+
if !exists {
44+
return true
45+
}
46+
return settings.IsEnabled()
47+
}
48+
49+
// KnownPluginNames is the set of valid plugin names for validation.
// Validate rejects any Monitors key that does not appear in this list.
50+
var KnownPluginNames = []string{
51+
"kernel-monitor",
52+
"networking",
53+
"storage-monitor",
54+
"nvidia",
55+
"neuron",
56+
"runtime",
57+
}
58+
59+
// Validate checks that all keys in Monitors are known plugin names.
60+
func (mc *MonitorConfig) Validate() error {
61+
if mc == nil || mc.Monitors == nil {
62+
return nil
63+
}
64+
var unknown []string
65+
for name := range mc.Monitors {
66+
if !slices.Contains(KnownPluginNames, name) {
67+
unknown = append(unknown, name)
68+
}
69+
}
70+
if len(unknown) > 0 {
71+
sort.Strings(unknown)
72+
return fmt.Errorf("unknown monitor plugin name(s): %s", strings.Join(unknown, ", "))
73+
}
74+
return nil
75+
}
76+
77+
// LoadMonitorConfig reads the config file at the given path.
78+
// Returns a default (all-enabled) config if the file does not exist.
79+
// Returns an error if the file exists but contains invalid YAML or unknown plugin names.
80+
// The second return value indicates whether the config file was found on disk.
81+
func LoadMonitorConfig(path string) (*MonitorConfig, bool, error) {
82+
data, err := os.ReadFile(path)
83+
if err != nil {
84+
if errors.Is(err, os.ErrNotExist) {
85+
return &MonitorConfig{}, false, nil
86+
}
87+
return nil, false, fmt.Errorf("reading monitor config: %w", err)
88+
}
89+
90+
// Empty file is treated as default config (all monitors enabled).
91+
if len(data) == 0 {
92+
return &MonitorConfig{}, true, nil
93+
}
94+
95+
cfg := &MonitorConfig{}
96+
if err := yaml.UnmarshalStrict(data, cfg); err != nil {
97+
return nil, false, fmt.Errorf("parsing monitor config: %w", err)
98+
}
99+
100+
if err := cfg.Validate(); err != nil {
101+
return nil, false, fmt.Errorf("validating monitor config: %w", err)
102+
}
103+
104+
return cfg, true, nil
105+
}

0 commit comments

Comments
 (0)