Skip to content

Commit b7d3ed3

Browse files
rcipdev and Ruchik Pravasi authored
feat: add ZRAM usage monitoring to kernel monitor (#93)
Co-authored-by: Ruchik Pravasi <ruchikpi@amazon.com>
1 parent 8379e15 commit b7d3ed3

2 files changed

Lines changed: 106 additions & 0 deletions

File tree

monitors/kernel/monitor.go

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ func (m *KernelMonitor) Register(ctx context.Context, mgr monitor.Manager) error
6464
util.NewChannelHandler(func(time.Time) error { return m.handleZombies() }, util.TimeTickWithJitterContext(ctx, 5*time.Minute)),
6565
util.NewChannelHandler(func(time.Time) error { return m.handleOpenedFiles() }, util.TimeTickWithJitterContext(ctx, 5*time.Minute)),
6666
util.NewChannelHandler(func(time.Time) error { return m.handleEnvironment() }, util.TimeTickWithJitterContext(ctx, 5*time.Minute)),
67+
util.NewChannelHandler(func(time.Time) error { return m.handleZram() }, util.TimeTickWithJitterContext(ctx, 5*time.Minute)),
6768
} {
6869
go handler.Start(ctx)
6970
}
@@ -349,3 +350,61 @@ func (k *KernelMonitor) checkEnvironment(envBytes []byte, pid int) error {
349350
}
350351
return nil
351352
}
353+
354+
// ~~~~ zram ~~~~
355+
356+
func (k *KernelMonitor) handleZram() error {
357+
zramDirs, err := filepath.Glob(config.ToHostPath("/sys/block/zram*"))
358+
if err != nil {
359+
return err
360+
}
361+
if len(zramDirs) == 0 {
362+
k.logger.V(1).Info("ZRAM devices not found on this node")
363+
return nil
364+
}
365+
for _, dir := range zramDirs {
366+
deviceName := filepath.Base(dir)
367+
mmStatData, err := os.ReadFile(filepath.Join(dir, "mm_stat"))
368+
if err != nil {
369+
k.logger.V(1).Info("failed to read ZRAM mm_stat", "device", deviceName, "error", err)
370+
continue
371+
}
372+
fields := strings.Fields(string(mmStatData))
373+
// mm_stat format: orig_data_size compr_data_size mem_used_total ...
374+
// We need at least the first 2 fields
375+
if len(fields) < 2 {
376+
k.logger.V(1).Info("invalid ZRAM mm_stat format", "device", deviceName, "fields", len(fields))
377+
continue
378+
}
379+
origSize, _ := strconv.ParseInt(fields[0], 10, 64)
380+
compSize, _ := strconv.ParseInt(fields[1], 10, 64)
381+
if origSize == 0 {
382+
continue
383+
}
384+
disksizeData, err := os.ReadFile(filepath.Join(dir, "disksize"))
385+
if err != nil {
386+
continue
387+
}
388+
disksize, _ := strconv.ParseInt(strings.TrimSpace(string(disksizeData)), 10, 64)
389+
if err := k.checkZram(origSize, compSize, disksize, deviceName); err != nil {
390+
return err
391+
}
392+
}
393+
return nil
394+
}
395+
396+
func (k *KernelMonitor) checkZram(origSize, compSize, disksize int64, deviceName string) error {
397+
if disksize == 0 {
398+
return nil
399+
}
400+
usagePercent := float64(origSize) / float64(disksize)
401+
if usagePercent > 0.10 {
402+
return k.manager.Notify(context.Background(), monitor.Condition{
403+
Reason: "ZramHighUsage",
404+
Message: fmt.Sprintf("ZRAM device %s at %.1f%% capacity", deviceName, usagePercent*100),
405+
Severity: monitor.SeverityWarning,
406+
})
407+
}
408+
return nil
409+
410+
}

monitors/kernel/monitor_test.go

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -289,4 +289,51 @@ func TestKernelPeriodic(t *testing.T) {
289289
}
290290
assert.Equal(t, 0, len(mockManager.res))
291291
})
292+
293+
t.Run("ZramHighUsageNoop", func(t *testing.T) {
294+
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
295+
defer cancel()
296+
mon := &KernelMonitor{}
297+
mockManager := &mockManager{res: make(chan monitor.Condition, 5)}
298+
mon.Register(ctx, mockManager)
299+
300+
if err := mon.checkZram(10*1024*1024, 5*1024*1024, 1024*1024*1024, "zram0"); err != nil {
301+
t.Fatal(err)
302+
}
303+
assert.Equal(t, 0, len(mockManager.res))
304+
305+
if err := mon.checkZram(50*1024*1024, 25*1024*1024, 1024*1024*1024, "zram1"); err != nil {
306+
t.Fatal(err)
307+
}
308+
assert.Equal(t, 0, len(mockManager.res))
309+
310+
if err := mon.checkZram(99*1024*1024, 50*1024*1024, 1000*1024*1024, "zram2"); err != nil {
311+
t.Fatal(err)
312+
}
313+
assert.Equal(t, 0, len(mockManager.res))
314+
})
315+
316+
t.Run("ZramHighUsageWarning", func(t *testing.T) {
317+
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
318+
defer cancel()
319+
mon := &KernelMonitor{}
320+
mockManager := &mockManager{res: make(chan monitor.Condition, 5)}
321+
mon.Register(ctx, mockManager)
322+
// 15% usage - above 10% threshold
323+
origSize := int64(150 * 1024 * 1024)
324+
compSize := int64(75 * 1024 * 1024)
325+
diskSize := int64(1024 * 1024 * 1024)
326+
if err := mon.checkZram(origSize, compSize, diskSize, "zram0"); err != nil {
327+
t.Fatal(err)
328+
}
329+
select {
330+
case <-ctx.Done():
331+
t.Fatal(ctx.Err())
332+
case monitorResult := <-mockManager.res:
333+
assert.Equal(t, monitor.SeverityWarning, monitorResult.Severity)
334+
assert.Equal(t, "ZramHighUsage", monitorResult.Reason)
335+
assert.Contains(t, monitorResult.Message, "zram0")
336+
assert.Contains(t, monitorResult.Message, "capacity")
337+
}
338+
})
292339
}

0 commit comments

Comments
 (0)