Skip to content

Commit e442ec1

Browse files
Fix stale heartbeat process PIDs cleanup issue (#902)
* dev/tools/resourcectl: fix persistent PID landmine and prevent arbitrary process termination #### What this PR does / why we need it: Fixes a logic vulnerability in `resourcectl cleanup` where stale heartbeat process PIDs remained permanently trapped in the local state file (`~/.local/resourcectl/state.json`) if the heartbeat process exited prematurely. When the OS eventually recycled that PID and assigned it to a new innocent process, the subsequent execution of `resourcectl cleanup` would violently terminate that unrelated process via `SIGKILL`. To fix this, we: - Safely verify the heartbeat process identity on Linux by reading and parsing `/proc/<pid>/cmdline` to ensure it contains the `"heartbeat"` and `"--name" <resource>` arguments before attempting to kill it. - Guarantee that stale PIDs are zeroed in `state.json` even if the kill operation fails. - Prune fully released resources completely from the state file rather than leaving empty structs, preventing unbounded file growth. - Ignore `ESRCH` errors if a heartbeat process is already dead. - Added comprehensive unit tests in `main_test.go` to reproduce the conditions and verify safety and cleanup behavior. #### Which issue(s) this PR is related to: Fixes b/511322601 #### Release Note ```release-note Fixed a logic issue in `resourcectl` where stale heartbeat PIDs could lead to arbitrary process termination due to PID recycling. ``` * address feedback * pr feedback
1 parent cf8f58f commit e442ec1

2 files changed

Lines changed: 276 additions & 5 deletions

File tree

dev/tools/resourcectl/main.go

Lines changed: 49 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ import (
2727
"os/exec"
2828
"os/signal"
2929
"path/filepath"
30+
"runtime"
3031
"strings"
3132
"syscall"
3233
"time"
@@ -300,6 +301,8 @@ func runCleanup(ctx context.Context) error {
300301
return err
301302
}
302303
var errs []error
304+
var remainingResources []BoskosResource
305+
303306
for i := range state.BoskosResources {
304307
r := &state.BoskosResources[i]
305308
if err := killHeartbeatProcess(ctx, r); err != nil {
@@ -310,12 +313,12 @@ func runCleanup(ctx context.Context) error {
310313

311314
if err := r.ReleaseFromBoskos(ctx); err != nil {
312315
errs = append(errs, err)
313-
} else {
314-
r.Name = ""
315-
r.Type = ""
316+
remainingResources = append(remainingResources, *r)
316317
}
317318
}
318319

320+
state.BoskosResources = remainingResources
321+
319322
if err := writeState(state); err != nil {
320323
return err
321324
}
@@ -329,20 +332,61 @@ func killHeartbeatProcess(ctx context.Context, r *BoskosResource) error {
329332
return nil
330333
}
331334

335+
log := klog.FromContext(ctx)
336+
337+
// Verify the process is actually the heartbeat process if we are on Linux.
338+
// On non-Linux platforms (e.g., macOS), we currently skip this verification
339+
// because /proc is not available. This leaves a small window for PID-recycling
340+
// issues on those platforms.
341+
if runtime.GOOS == "linux" {
342+
if !isHeartbeatProcess(r.HeartbeatPID, r.Name) {
343+
return nil
344+
}
345+
} else {
346+
log.V(4).Info("skipping heartbeat process verification on non-linux platform", "goos", runtime.GOOS, "pid", r.HeartbeatPID)
347+
}
348+
332349
process, err := os.FindProcess(r.HeartbeatPID)
333350
if err != nil {
334351
return fmt.Errorf("error finding heartbeat process: %w", err)
335352
}
336353

337-
// TODO: Verify the process is actually the heartbeat process
338-
339354
if err := process.Kill(); err != nil {
355+
if errors.Is(err, syscall.ESRCH) {
356+
return nil
357+
}
340358
return fmt.Errorf("error killing heartbeat process: %w", err)
341359
}
342360

343361
return nil
344362
}
345363

364+
// isHeartbeatProcess verifies if the process with the given PID is actually our heartbeat process.
365+
func isHeartbeatProcess(pid int, resourceName string) bool {
366+
cmdlinePath := fmt.Sprintf("/proc/%d/cmdline", pid)
367+
data, err := os.ReadFile(cmdlinePath)
368+
if err != nil {
369+
return false
370+
}
371+
372+
// The /proc/<pid>/cmdline file contains arguments separated by null bytes (\x00).
373+
args := strings.Split(string(data), "\x00")
374+
375+
// Check if the arguments contain "heartbeat" and "--name" <resourceName>
376+
hasHeartbeat := false
377+
hasName := false
378+
for i := 0; i < len(args); i++ {
379+
if args[i] == "heartbeat" {
380+
hasHeartbeat = true
381+
}
382+
if args[i] == "--name" && i+1 < len(args) && args[i+1] == resourceName {
383+
hasName = true
384+
}
385+
}
386+
387+
return hasHeartbeat && hasName
388+
}
389+
346390
// runHeartbeat sends periodic heartbeats to Boskos to keep the resource alive.
347391
// This is run as a child process of runGet.
348392
func runHeartbeat(ctx context.Context, name, owner string) error {

dev/tools/resourcectl/main_test.go

Lines changed: 227 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,227 @@
1+
// Copyright 2026 The Kubernetes Authors.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package main
16+
17+
import (
18+
"context"
19+
"encoding/json"
20+
"errors"
21+
"fmt"
22+
"net/http"
23+
"net/http/httptest"
24+
"os"
25+
"os/exec"
26+
"runtime"
27+
"syscall"
28+
"testing"
29+
"time"
30+
)
31+
32+
func TestHelperProcess(t *testing.T) {
33+
if os.Getenv("GO_WANT_HELPER_PROCESS") != "1" {
34+
return
35+
}
36+
// Just sleep forever (or until killed)
37+
select {}
38+
}
39+
40+
func TestIsHeartbeatProcess(t *testing.T) {
41+
if runtime.GOOS != "linux" {
42+
t.Skip("skipping /proc based test on non-linux platform")
43+
}
44+
45+
cmd := exec.Command(os.Args[0], "-test.run=TestHelperProcess", "--", "heartbeat", "--name", "test-resource")
46+
cmd.Env = append(os.Environ(), "GO_WANT_HELPER_PROCESS=1")
47+
if err := cmd.Start(); err != nil {
48+
t.Fatalf("failed to start helper process: %v", err)
49+
}
50+
defer func() {
51+
_ = cmd.Process.Kill()
52+
_ = cmd.Wait()
53+
}()
54+
// Let the OS populate /proc/<pid>/cmdline
55+
time.Sleep(50 * time.Millisecond)
56+
57+
// Verify isHeartbeatProcess returns true for our helper process
58+
if !isHeartbeatProcess(cmd.Process.Pid, "test-resource") {
59+
t.Errorf("expected isHeartbeatProcess to return true, but got false")
60+
}
61+
62+
// Verify isHeartbeatProcess returns false for a different resource name
63+
if isHeartbeatProcess(cmd.Process.Pid, "other-resource") {
64+
t.Errorf("expected isHeartbeatProcess to return false for different resource name, but got true")
65+
}
66+
67+
// Spawn a standard sleep command which does not have "heartbeat" arguments
68+
dummy := exec.Command("sleep", "10")
69+
if err := dummy.Start(); err != nil {
70+
t.Fatalf("failed to start dummy process: %v", err)
71+
}
72+
defer func() {
73+
_ = dummy.Process.Kill()
74+
_ = dummy.Wait()
75+
}()
76+
77+
if isHeartbeatProcess(dummy.Process.Pid, "test-resource") {
78+
t.Errorf("expected isHeartbeatProcess to return false for generic process, but got true")
79+
}
80+
}
81+
82+
func TestRunCleanup(t *testing.T) {
83+
// Create temporary home directory to sandbox state file
84+
tmpHome, err := os.MkdirTemp("", "resourcectl-test-home-*")
85+
if err != nil {
86+
t.Fatalf("failed to create temp dir: %v", err)
87+
}
88+
defer os.RemoveAll(tmpHome)
89+
90+
// Override HOME
91+
origHome := os.Getenv("HOME")
92+
os.Setenv("HOME", tmpHome)
93+
defer os.Setenv("HOME", origHome)
94+
95+
// Start a mock Boskos server
96+
mockBoskos := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
97+
// Just return 200 OK
98+
w.WriteHeader(http.StatusOK)
99+
fmt.Fprint(w, `{"status": "ok"}`)
100+
}))
101+
defer mockBoskos.Close()
102+
103+
// Override BOSKOS_HOST
104+
origBoskosHost := os.Getenv("BOSKOS_HOST")
105+
os.Setenv("BOSKOS_HOST", mockBoskos.URL)
106+
defer os.Setenv("BOSKOS_HOST", origBoskosHost)
107+
108+
// 1. Spawn a matching heartbeat process that we want to kill.
109+
cmd := exec.Command(os.Args[0], "-test.run=TestHelperProcess", "--", "heartbeat", "--name", "matching-resource")
110+
cmd.Env = append(os.Environ(), "GO_WANT_HELPER_PROCESS=1")
111+
if err := cmd.Start(); err != nil {
112+
t.Fatalf("failed to start matching helper process: %v", err)
113+
}
114+
var cmdWaited bool
115+
defer func() {
116+
_ = cmd.Process.Kill()
117+
if !cmdWaited {
118+
_ = cmd.Wait()
119+
}
120+
}()
121+
122+
// 2. Spawn a recycled/innocent process that should NOT be killed.
123+
// We'll use a simple sleep command as the innocent process.
124+
innocentCmd := exec.Command("sleep", "30")
125+
if err := innocentCmd.Start(); err != nil {
126+
t.Fatalf("failed to start innocent process: %v", err)
127+
}
128+
defer func() {
129+
_ = innocentCmd.Process.Kill()
130+
_ = innocentCmd.Wait()
131+
}()
132+
133+
// Let the OS populate /proc/<pid>/cmdline for both processes
134+
time.Sleep(50 * time.Millisecond)
135+
136+
// 3. Generate a dead PID (start and exit)
137+
deadCmd := exec.Command("true")
138+
if err := deadCmd.Run(); err != nil {
139+
t.Fatalf("failed to run dead process: %v", err)
140+
}
141+
deadPid := deadCmd.Process.Pid
142+
143+
// Write initial state.json
144+
stateFile, err := stateFilePath()
145+
if err != nil {
146+
t.Fatalf("failed to get state file path: %v", err)
147+
}
148+
149+
initialState := State{
150+
BoskosResources: []BoskosResource{
151+
{
152+
Name: "matching-resource",
153+
Type: "gce-project",
154+
HeartbeatPID: cmd.Process.Pid,
155+
},
156+
{
157+
Name: "innocent-resource",
158+
Type: "gce-project",
159+
HeartbeatPID: innocentCmd.Process.Pid,
160+
},
161+
{
162+
Name: "dead-resource",
163+
Type: "gce-project",
164+
HeartbeatPID: deadPid,
165+
},
166+
},
167+
}
168+
169+
data, err := json.MarshalIndent(initialState, "", " ")
170+
if err != nil {
171+
t.Fatalf("failed to marshal initial state: %v", err)
172+
}
173+
if err := os.WriteFile(stateFile, data, 0600); err != nil {
174+
t.Fatalf("failed to write initial state file: %v", err)
175+
}
176+
177+
// Run cleanup!
178+
ctx := context.Background()
179+
err = runCleanup(ctx)
180+
if err != nil {
181+
t.Fatalf("runCleanup failed: %v", err)
182+
}
183+
184+
// 4. Verify results:
185+
// - The matching-resource heartbeat process should be killed.
186+
// - The innocent-resource process should STILL be running (not killed).
187+
// - The state file should be completely empty of resources (since all released successfully).
188+
189+
// Verify matching heartbeat is killed.
190+
// Since the child process was killed by SIGKILL, wait should return an error indicating it was signaled.
191+
err = cmd.Wait()
192+
cmdWaited = true
193+
if err == nil {
194+
t.Errorf("expected matching heartbeat process to be killed, but it exited cleanly")
195+
} else {
196+
var exitErr *exec.ExitError
197+
if errors.As(err, &exitErr) {
198+
status := exitErr.Sys().(syscall.WaitStatus)
199+
if !status.Signaled() || status.Signal() != syscall.SIGKILL {
200+
t.Errorf("expected process to be killed by SIGKILL, but got exit status: %v", err)
201+
}
202+
} else {
203+
t.Errorf("unexpected error waiting for process: %v", err)
204+
}
205+
}
206+
207+
// Verify innocent process is NOT killed (on Linux only, where verification is active).
208+
if runtime.GOOS == "linux" {
209+
process, err := os.FindProcess(innocentCmd.Process.Pid)
210+
if err == nil {
211+
err = process.Signal(syscall.Signal(0))
212+
if err != nil {
213+
t.Errorf("expected innocent process to still be running, but got error: %v", err)
214+
}
215+
}
216+
}
217+
218+
// Verify state file is empty.
219+
state, err := readState()
220+
if err != nil {
221+
t.Fatalf("failed to read state file: %v", err)
222+
}
223+
224+
if len(state.BoskosResources) != 0 {
225+
t.Errorf("expected 0 resources left in state.json, but got %d: %+v", len(state.BoskosResources), state.BoskosResources)
226+
}
227+
}

0 commit comments

Comments
 (0)