
Commit 005d2be

dashboard/coordinator: periodically clean stale VMs
This isn't used yet, but will be for the new-style builders (VMs on GCE running the buildlet, started by the coordinator). From the code's comments:

cleanUpOldVMs periodically enumerates virtual machines and deletes any which have a "delete-at" metadata attribute holding a unix timestamp before the current time. These VMs are created to run a single build and should be shut down by a controlling process. Due to various types of failures, they might get stranded. To prevent them from getting stranded and wasting resources forever, we instead set the "delete-at" metadata attribute on them when created to some time that's well beyond their expected lifetime, and then this is the backup mechanism to delete them if they get away.

Update golang/go#8639
Update golang/go#8640
Update golang/go#8642

Change-Id: I489e97926e7ab56487571c2bf0bd255cdf49570d
Reviewed-on: https://go-review.googlesource.com/2199
Reviewed-by: Burcu Dogan <[email protected]>
1 parent: faf0ad1
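
For context, the controlling-process side is not part of this commit. The sketch below (hypothetical helper and instance names, assuming the same compute/v1 package vintage as the diff, where MetadataItems.Value is a plain string rather than the *string of newer releases) shows how a build VM could be stamped with the "delete-at" attribute at creation so the coordinator's backup cleaner can reap it if the normal shutdown path fails:

package main

import (
	"fmt"
	"strconv"
	"time"

	"google.golang.org/api/compute/v1"
)

// stampDeleteAt is a hypothetical helper, not part of this commit. It marks an
// instance definition with a "delete-at" unix timestamp well beyond the
// expected build duration, so cleanUpOldVMs only deletes the VM if the
// controlling process fails to shut it down itself.
func stampDeleteAt(inst *compute.Instance, lifetime time.Duration) {
	deadline := time.Now().Add(lifetime).Unix() // unix seconds, matching what cleanZoneVMs parses
	if inst.Metadata == nil {
		inst.Metadata = &compute.Metadata{}
	}
	inst.Metadata.Items = append(inst.Metadata.Items, &compute.MetadataItems{
		Key:   "delete-at",
		Value: strconv.FormatInt(deadline, 10),
	})
}

func main() {
	// A controlling process would build the instance definition, stamp it,
	// then pass it to svc.Instances.Insert(proj, zone, inst).Do(), and delete
	// the instance itself when the build finishes.
	inst := &compute.Instance{Name: "buildlet-linux-12345"} // hypothetical name
	stampDeleteAt(inst, 45*time.Minute)                     // well beyond an expected single build
	fmt.Println(inst.Metadata.Items[0].Key, inst.Metadata.Items[0].Value)
}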

File tree

2 files changed: +116 lines, -2 lines


dashboard/coordinator/Makefile

Lines changed: 5 additions & 2 deletions
@@ -1,6 +1,9 @@
 coordinator: main.go
-	GOOS=linux go build -o coordinator .
+	GOOS=linux go build --tags=build_coordinator -o coordinator .
 
+# After "make upload", either reboot the machine, or ssh to it and:
+# sudo systemctl restart gobuild.service
+# And watch its logs with:
+# sudo journalctl -f -u gobuild.service
 upload: coordinator
 	cat coordinator | (cd buildongce && go run create.go --write_object=go-builder-data/coordinator)
-
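
The new --tags=build_coordinator flag works together with the // +build build_coordinator constraint added to main.go below: the coordinator sources are only compiled when that tag is passed. As a minimal illustration (file and package names here are hypothetical, not part of the commit), an old-style build constraint looks like this; note the blank line required between the constraint comment and the package clause:

// +build build_coordinator

// This file is skipped by a plain "go build ." and compiled only when
// built with "go build --tags=build_coordinator ." (or -tags=...).
package main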

dashboard/coordinator/main.go

Lines changed: 111 additions & 0 deletions
@@ -2,6 +2,8 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+// +build build_coordinator
+
 // The coordinator runs on GCE and coordinates builds in Docker containers.
 package main // import "golang.org/x/tools/dashboard/coordinator"
 
@@ -20,15 +22,23 @@ import (
 	"os"
 	"os/exec"
 	"sort"
+	"strconv"
 	"strings"
 	"sync"
 	"time"
+
+	"golang.org/x/oauth2"
+	"golang.org/x/oauth2/google"
+	"google.golang.org/api/compute/v1"
+	"google.golang.org/cloud/compute/metadata"
 )
 
 var (
 	masterKeyFile = flag.String("masterkey", "", "Path to builder master key. Else fetched using GCE project attribute 'builder-master-key'.")
 	maxBuilds     = flag.Int("maxbuilds", 6, "Max concurrent builds")
 
+	cleanZones = flag.String("zones", "us-central1-a,us-central1-b,us-central1-f", "Comma-separated list of zones to periodically clean of stale build VMs (ones that failed to shut themselves down)")
+
 	// Debug flags:
 	addTemp = flag.Bool("temp", false, "Append -temp to all builders.")
 	just    = flag.String("just", "", "If non-empty, run single build in the foreground. Requires rev.")
@@ -131,6 +141,7 @@ func main() {
 	go http.ListenAndServe(":80", nil)
 
 	go cleanUpOldContainers()
+	go cleanUpOldVMs()
 
 	for _, watcher := range watchers {
 		if err := startWatching(watchers[watcher.repo]); err != nil {
@@ -581,3 +592,103 @@ func oldContainers() []string {
 	out, _ := exec.Command("docker", "ps", "-a", "--filter=status=exited", "--no-trunc", "-q").Output()
 	return strings.Fields(string(out))
 }
+
+// cleanUpOldVMs loops forever and periodically enumerates virtual
+// machines and deletes those which have expired.
+//
+// A VM is considered expired if it has a "delete-at" metadata
+// attribute having a unix timestamp before the current time.
+//
+// This is the safety mechanism to delete VMs which stray from the
+// normal deleting process. VMs are created to run a single build and
+// should be shut down by a controlling process. Due to various types
+// of failures, they might get stranded. To prevent them from getting
+// stranded and wasting resources forever, we instead set the
+// "delete-at" metadata attribute on them when created to some time
+// that's well beyond their expected lifetime.
+func cleanUpOldVMs() {
+	if !hasComputeScope() {
+		log.Printf("The coordinator is not running with access to read and write Compute resources. Background VM cleaning disabled.")
+		return
+	}
+	ts := google.ComputeTokenSource("default")
+	computeService, _ := compute.New(oauth2.NewClient(oauth2.NoContext, ts))
+	for {
+		for _, zone := range strings.Split(*cleanZones, ",") {
+			zone = strings.TrimSpace(zone)
+			if err := cleanZoneVMs(computeService, zone); err != nil {
+				log.Printf("Error cleaning VMs in zone %q: %v", zone, err)
+			}
+		}
+		time.Sleep(time.Minute)
+	}
+}
+
+// cleanZoneVMs is part of cleanUpOldVMs, operating on a single zone.
+func cleanZoneVMs(svc *compute.Service, zone string) error {
+	proj, err := metadata.ProjectID()
+	if err != nil {
+		return fmt.Errorf("failed to get current GCE ProjectID: %v", err)
+	}
+	// Fetch the first 500 (default) running instances and clean
+	// those. We expect that we'll be running many fewer than
+	// that. Even if we have more, eventually the first 500 will
+	// either end or be cleaned, and then the next call will get a
+	// partially-different 500.
+	// TODO(bradfitz): revisit this code if we ever start running
+	// thousands of VMs.
+	list, err := svc.Instances.List(proj, zone).Do()
+	if err != nil {
+		return fmt.Errorf("listing instances: %v", err)
+	}
+	for _, inst := range list.Items {
+		if inst.Metadata == nil {
+			// Defensive. Not seen in practice.
+			continue
+		}
+		for _, it := range inst.Metadata.Items {
+			if it.Key == "delete-at" {
+				unixDeadline, err := strconv.ParseInt(it.Value, 10, 64)
+				if err != nil {
+					log.Printf("invalid delete-at value %q seen; ignoring", it.Value)
+				}
+				if err == nil && time.Now().Unix() > unixDeadline {
+					log.Printf("Deleting expired VM %q in zone %q ...", inst.Name, zone)
+					deleteVM(svc, zone, inst.Name)
+				}
+			}
+		}
+	}
+	return nil
+}
+
+func deleteVM(svc *compute.Service, zone, instName string) {
+	proj, err := metadata.ProjectID()
+	if err != nil {
+		log.Printf("failed to get project id to delete instance: %v", err)
+		return
+	}
+	op, err := svc.Instances.Delete(proj, zone, instName).Do()
+	if err != nil {
+		log.Printf("Failed to delete instance %q in zone %q: %v", instName, zone, err)
+		return
+	}
+	log.Printf("Sent request to delete instance %q in zone %q. Operation ID == %v", instName, zone, op.Id)
+}
+
+func hasComputeScope() bool {
+	if !metadata.OnGCE() {
+		return false
+	}
+	scopes, err := metadata.Scopes("default")
+	if err != nil {
+		log.Printf("failed to query metadata default scopes: %v", err)
+		return false
+	}
+	for _, v := range scopes {
+		if v == compute.DevstorageFull_controlScope {
+			return true
+		}
+	}
+	return false
+}
