Skip to content

Commit 25a173f

Browse files
committed
[supervisor] add max retries on exit code errors
1 parent 95aa393 commit 25a173f

File tree

2 files changed

+197
-68
lines changed

2 files changed

+197
-68
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
// Copyright (c) 2023 Gitpod GmbH. All rights reserved.
2+
// Licensed under the GNU Affero General Public License (AGPL).
3+
// See License.AGPL.txt in the project root for license information.
4+
5+
package supervisor
6+
7+
import (
8+
"context"
9+
"errors"
10+
"fmt"
11+
"net"
12+
"os"
13+
"os/exec"
14+
"path/filepath"
15+
"strings"
16+
"sync"
17+
"syscall"
18+
"time"
19+
20+
"github.com/cenkalti/backoff/v4"
21+
22+
"github.com/gitpod-io/gitpod/common-go/analytics"
23+
"github.com/gitpod-io/gitpod/common-go/log"
24+
"github.com/gitpod-io/gitpod/common-go/process"
25+
"github.com/gitpod-io/gitpod/supervisor/api"
26+
"github.com/gitpod-io/gitpod/supervisor/pkg/activation"
27+
"github.com/gitpod-io/gitpod/supervisor/pkg/terminal"
28+
)
29+
30+
// if exit error happens one after another within dockerStartErrorBurstDuration then we're in the burst
31+
// and should stop trying after maxBurstDockerStartAttempts
32+
const (
33+
maxBurstDockerStartAttempts = 15
34+
dockerStartErrorBurstDuration = time.Minute * 1
35+
maxIntervalBetweenDockerStart = 15 * time.Second
36+
)
37+
38+
func socketActivationForDocker(parentCtx context.Context, wg *sync.WaitGroup, term *terminal.Mux, cfg *Config, telemetry analytics.Writer, notifications *NotificationService) {
39+
defer wg.Done()
40+
41+
ctx, cancel := context.WithCancel(parentCtx)
42+
defer cancel()
43+
44+
burstAttempts := 0
45+
backoffStrategy := backoff.NewExponentialBackOff()
46+
backoffStrategy.MaxInterval = maxIntervalBetweenDockerStart
47+
var lastExitErrorTime time.Time
48+
49+
for ctx.Err() == nil {
50+
err := listenToDockerSocket(ctx, term, cfg, telemetry)
51+
52+
var exitError *exec.ExitError
53+
if err != nil {
54+
exitError, _ = err.(*exec.ExitError)
55+
}
56+
if exitError != nil && exitError.ExitCode() > 0 {
57+
// docker-up or daemon exited with an error - we'll try again
58+
// it can be a transient condition like apt is locked by another process to install prerequisites
59+
// or permament like misconfigured env var, image which does not allow to install prerequisites at all
60+
// or any general issue with docker daemon startup
61+
if time.Since(lastExitErrorTime) <= dockerStartErrorBurstDuration {
62+
// we're in the eixt error burst
63+
burstAttempts++
64+
}
65+
lastExitErrorTime = time.Now()
66+
} else {
67+
// transient condition like a docker daemon get killed because of OOM
68+
burstAttempts = 0
69+
backoffStrategy.Reset()
70+
}
71+
nextBackOff := backoffStrategy.NextBackOff()
72+
73+
if nextBackOff == backoff.Stop || burstAttempts >= maxBurstDockerStartAttempts {
74+
cancel()
75+
// we don't want to try forever to avoid spamming our logs
76+
log.WithError(err).WithField("attempts", burstAttempts).Error("cannot activate docker after maximum attempts")
77+
_, _ = notifications.Notify(ctx, &api.NotifyRequest{
78+
Level: api.NotifyRequest_ERROR,
79+
Message: "Docker daemon failing to start, `cat /workspace/.gitpod/logs/docker-up.log` to see logs.",
80+
})
81+
continue
82+
}
83+
84+
if err != nil &&
85+
!errors.Is(err, context.Canceled) &&
86+
!process.IsNotChildProcess(err) &&
87+
!strings.Contains(err.Error(), "signal: ") {
88+
// don't log typical transient errors
89+
log.WithError(err).WithField("attempts", burstAttempts).WithField("backoff", nextBackOff.String()).Error("cannot activate docker, retrying...")
90+
}
91+
time.Sleep(nextBackOff)
92+
}
93+
}
94+
95+
// listenToDockerSocket listens to the docker socket and starts the docker-up process
96+
// if it fails to start then listener is closed to requests all incoming requests and prevent infinite try
97+
func listenToDockerSocket(parentCtx context.Context, term *terminal.Mux, cfg *Config, telemetry analytics.Writer) error {
98+
ctx, cancel := context.WithCancel(parentCtx)
99+
defer cancel()
100+
101+
fn := "/var/run/docker.sock"
102+
l, err := net.Listen("unix", fn)
103+
if err != nil {
104+
return err
105+
}
106+
107+
go func() {
108+
<-ctx.Done()
109+
l.Close()
110+
}()
111+
112+
_ = os.Chown(fn, gitpodUID, gitpodGID)
113+
return activation.Listen(ctx, l, func(socketFD *os.File) error {
114+
startTime := time.Now()
115+
defer socketFD.Close()
116+
cmd := exec.Command("/usr/bin/docker-up")
117+
cmd.Env = append(os.Environ(), "LISTEN_FDS=1")
118+
cmd.ExtraFiles = []*os.File{socketFD}
119+
alias, err := term.Start(cmd, terminal.TermOptions{
120+
Annotations: map[string]string{
121+
"gitpod.supervisor": "true",
122+
},
123+
LogToStdout: true,
124+
})
125+
telemetry.Track(analytics.TrackMessage{
126+
Identity: analytics.Identity{UserID: cfg.OwnerId},
127+
Event: "gitpod_activate_docker",
128+
Properties: map[string]interface{}{
129+
"instanceId": cfg.WorkspaceInstanceID,
130+
"workspaceId": cfg.WorkspaceID,
131+
"debugWorkspace": cfg.isDebugWorkspace(),
132+
},
133+
})
134+
if err != nil {
135+
return err
136+
}
137+
pty, ok := term.Get(alias)
138+
if !ok {
139+
return errors.New("cannot find pty")
140+
}
141+
go logDockerUp(startTime, pty)
142+
ptyCtx, cancel := context.WithCancel(ctx)
143+
go func(ptyCtx context.Context) {
144+
select {
145+
case <-ctx.Done():
146+
_ = pty.Command.Process.Signal(syscall.SIGTERM)
147+
case <-ptyCtx.Done():
148+
}
149+
}(ptyCtx)
150+
_, err = pty.Wait()
151+
cancel()
152+
return err
153+
})
154+
}
155+
156+
func logDockerUp(startTime time.Time, pty *terminal.Term) {
157+
logsDir := "/workspace/.gitpod/logs"
158+
if err := os.MkdirAll(logsDir, 0755); err != nil {
159+
log.WithError(err).Error("cannot create logs dir")
160+
return
161+
}
162+
if err := os.Chown(logsDir, gitpodUID, gitpodGID); err != nil {
163+
log.WithError(err).Error("cannot chown logs dir")
164+
return
165+
}
166+
logFilePath := filepath.Join(logsDir, "docker-up.log")
167+
logFile, err := os.OpenFile(logFilePath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
168+
if err != nil {
169+
log.WithError(err).Error("cannot open docker-up log file")
170+
return
171+
}
172+
defer logFile.Close()
173+
174+
if err := os.Chown(logFilePath, gitpodUID, gitpodGID); err != nil {
175+
log.WithError(err).Error("cannot chown docker-up log file")
176+
return
177+
}
178+
fmt.Fprintf(logFile, "======= Start docker-up at %s =======\n", startTime.Format(time.RFC3339))
179+
180+
stdout := pty.Stdout.Listen()
181+
defer stdout.Close()
182+
183+
for {
184+
buf := make([]byte, 4096)
185+
n, err := stdout.Read(buf)
186+
if err != nil {
187+
break
188+
}
189+
190+
if _, err := logFile.Write(buf[:n]); err != nil {
191+
log.WithError(err).Error("cannot write to docker-up log file")
192+
}
193+
}
194+
195+
fmt.Fprintf(logFile, "\n======= Stop docker-up at %s =======\n", time.Now().Format(time.RFC3339))
196+
}

components/supervisor/pkg/supervisor/supervisor.go

+1-68
Original file line numberDiff line numberDiff line change
@@ -50,13 +50,11 @@ import (
5050
"github.com/gitpod-io/gitpod/common-go/analytics"
5151
"github.com/gitpod-io/gitpod/common-go/log"
5252
"github.com/gitpod-io/gitpod/common-go/pprof"
53-
"github.com/gitpod-io/gitpod/common-go/process"
5453
csapi "github.com/gitpod-io/gitpod/content-service/api"
5554
"github.com/gitpod-io/gitpod/content-service/pkg/executor"
5655
"github.com/gitpod-io/gitpod/content-service/pkg/git"
5756
gitpod "github.com/gitpod-io/gitpod/gitpod-protocol"
5857
"github.com/gitpod-io/gitpod/supervisor/api"
59-
"github.com/gitpod-io/gitpod/supervisor/pkg/activation"
6058
"github.com/gitpod-io/gitpod/supervisor/pkg/config"
6159
"github.com/gitpod-io/gitpod/supervisor/pkg/dropwriter"
6260
"github.com/gitpod-io/gitpod/supervisor/pkg/metrics"
@@ -409,7 +407,7 @@ func Run(options ...RunOption) {
409407

410408
if !opts.RunGP {
411409
wg.Add(1)
412-
go socketActivationForDocker(ctx, &wg, termMux, cfg, telemetry)
410+
go socketActivationForDocker(ctx, &wg, termMux, cfg, telemetry, notificationService)
413411
}
414412

415413
if cfg.isHeadless() {
@@ -1548,71 +1546,6 @@ func recordInitializerMetrics(path string, metrics *metrics.SupervisorMetrics) {
15481546
}
15491547
}
15501548

1551-
func socketActivationForDocker(ctx context.Context, wg *sync.WaitGroup, term *terminal.Mux, cfg *Config, w analytics.Writer) {
1552-
defer wg.Done()
1553-
1554-
fn := "/var/run/docker.sock"
1555-
l, err := net.Listen("unix", fn)
1556-
if err != nil {
1557-
log.WithError(err).Error("cannot provide Docker activation socket")
1558-
return
1559-
}
1560-
1561-
go func() {
1562-
<-ctx.Done()
1563-
l.Close()
1564-
}()
1565-
1566-
_ = os.Chown(fn, gitpodUID, gitpodGID)
1567-
for ctx.Err() == nil {
1568-
err = activation.Listen(ctx, l, func(socketFD *os.File) error {
1569-
defer socketFD.Close()
1570-
cmd := exec.Command("/usr/bin/docker-up")
1571-
cmd.Env = append(os.Environ(), "LISTEN_FDS=1")
1572-
cmd.ExtraFiles = []*os.File{socketFD}
1573-
alias, err := term.Start(cmd, terminal.TermOptions{
1574-
Annotations: map[string]string{
1575-
"gitpod.supervisor": "true",
1576-
},
1577-
LogToStdout: true,
1578-
})
1579-
w.Track(analytics.TrackMessage{
1580-
Identity: analytics.Identity{UserID: cfg.OwnerId},
1581-
Event: "gitpod_activate_docker",
1582-
Properties: map[string]interface{}{
1583-
"instanceId": cfg.WorkspaceInstanceID,
1584-
"workspaceId": cfg.WorkspaceID,
1585-
"debugWorkspace": cfg.isDebugWorkspace(),
1586-
},
1587-
})
1588-
if err != nil {
1589-
return err
1590-
}
1591-
pty, ok := term.Get(alias)
1592-
if !ok {
1593-
return errors.New("cannot find pty")
1594-
}
1595-
ptyCtx, cancel := context.WithCancel(context.Background())
1596-
go func(ptyCtx context.Context) {
1597-
select {
1598-
case <-ctx.Done():
1599-
_ = pty.Command.Process.Signal(syscall.SIGTERM)
1600-
case <-ptyCtx.Done():
1601-
}
1602-
}(ptyCtx)
1603-
_, err = pty.Wait()
1604-
cancel()
1605-
return err
1606-
})
1607-
if err != nil &&
1608-
!errors.Is(err, context.Canceled) &&
1609-
!process.IsNotChildProcess(err) &&
1610-
!strings.Contains(err.Error(), "signal: ") {
1611-
log.WithError(err).Error("cannot provide Docker activation socket")
1612-
}
1613-
}
1614-
}
1615-
16161549
type PerfAnalyzer struct {
16171550
label string
16181551
defs []int

0 commit comments

Comments
 (0)