Skip to content

Commit d095988

Browse files
committed
[supervisor] add max retries on exit code errors
1 parent 95aa393 commit d095988

File tree

2 files changed

+261
-68
lines changed

2 files changed

+261
-68
lines changed
Lines changed: 260 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,260 @@
1+
// Copyright (c) 2023 Gitpod GmbH. All rights reserved.
2+
// Licensed under the GNU Affero General Public License (AGPL).
3+
// See License.AGPL.txt in the project root for license information.
4+
5+
package supervisor
6+
7+
import (
8+
"context"
9+
"errors"
10+
"fmt"
11+
"io"
12+
"net"
13+
"os"
14+
"os/exec"
15+
"strings"
16+
"sync"
17+
"syscall"
18+
"time"
19+
20+
"github.com/cenkalti/backoff/v4"
21+
"golang.org/x/xerrors"
22+
23+
"github.com/gitpod-io/gitpod/common-go/analytics"
24+
"github.com/gitpod-io/gitpod/common-go/log"
25+
"github.com/gitpod-io/gitpod/common-go/process"
26+
"github.com/gitpod-io/gitpod/supervisor/api"
27+
"github.com/gitpod-io/gitpod/supervisor/pkg/activation"
28+
"github.com/gitpod-io/gitpod/supervisor/pkg/dropwriter"
29+
"github.com/gitpod-io/gitpod/supervisor/pkg/terminal"
30+
)
31+
32+
// if exit error happens one after another within dockerStartErrorBurstDuration then we're in the burst
33+
// and should stop trying after maxBurstDockerStartAttempts
34+
const (
35+
maxBurstDockerStartAttempts = 5
36+
dockerStartErrorBurstDuration = time.Minute * 1
37+
maxIntervalBetweenDockerStart = 15 * time.Second
38+
39+
logsDir = "/workspace/.gitpod/logs"
40+
dockerUpLogFilePath = logsDir + "/docker-up.log"
41+
)
42+
43+
func socketActivationForDocker(parentCtx context.Context, wg *sync.WaitGroup, term *terminal.Mux, cfg *Config, telemetry analytics.Writer, notifications *NotificationService) {
44+
defer wg.Done()
45+
46+
ctx, cancel := context.WithCancel(parentCtx)
47+
defer cancel()
48+
49+
logFile, err := openDockerUpLogFile()
50+
if err != nil {
51+
log.WithError(err).Error("docker-ip: cannot open log file")
52+
} else {
53+
defer logFile.Close()
54+
}
55+
56+
var notificationSent bool
57+
var disableNotificaiton bool
58+
notifyDockerUpFail := func() error {
59+
if disableNotificaiton || notificationSent {
60+
return nil
61+
}
62+
notificationSent = true
63+
defer func() {
64+
notificationSent = false
65+
}()
66+
openLogs := "Open Logs"
67+
dontShowAgain := "Don't Show Again"
68+
resp, err := notifications.Notify(ctx, &api.NotifyRequest{
69+
Level: api.NotifyRequest_ERROR,
70+
Message: "Docker daemon is failing to start.",
71+
Actions: []string{openLogs, dontShowAgain},
72+
})
73+
if err != nil {
74+
return err
75+
}
76+
telemetry.Track(analytics.TrackMessage{
77+
Identity: analytics.Identity{UserID: cfg.OwnerId},
78+
Event: "gitpod_activate_docker_fail_notification",
79+
Properties: map[string]interface{}{
80+
"instanceId": cfg.WorkspaceInstanceID,
81+
"workspaceId": cfg.WorkspaceID,
82+
"debugWorkspace": cfg.isDebugWorkspace(),
83+
"action": resp.Action,
84+
},
85+
})
86+
if resp.Action == dontShowAgain {
87+
disableNotificaiton = true
88+
return nil
89+
}
90+
if resp.Action != openLogs {
91+
return nil
92+
}
93+
gpPath, err := exec.LookPath("gp")
94+
if err != nil {
95+
return err
96+
}
97+
gpCmd := exec.CommandContext(ctx, gpPath, "open", dockerUpLogFilePath)
98+
gpCmd.Env = childProcEnvvars
99+
gpCmd.Stdout = os.Stdout
100+
gpCmd.Stderr = os.Stderr
101+
return gpCmd.Run()
102+
}
103+
104+
burstAttempts := 0
105+
backoffStrategy := backoff.NewExponentialBackOff()
106+
backoffStrategy.MaxInterval = maxIntervalBetweenDockerStart
107+
var lastExitErrorTime time.Time
108+
109+
for ctx.Err() == nil {
110+
err := listenToDockerSocket(ctx, term, cfg, telemetry, logFile)
111+
if ctx.Err() != nil {
112+
return
113+
}
114+
115+
var exitError *exec.ExitError
116+
if err != nil {
117+
exitError, _ = err.(*exec.ExitError)
118+
}
119+
if exitError != nil && exitError.ExitCode() > 0 {
120+
// docker-up or daemon exited with an error - we'll try again
121+
// it can be a transient condition like apt is locked by another process to install prerequisites
122+
// or permament like misconfigured env var, image which does not allow to install prerequisites at all
123+
// or any general issue with docker daemon startup
124+
if time.Since(lastExitErrorTime) <= dockerStartErrorBurstDuration {
125+
// we're in the eixt error burst
126+
burstAttempts++
127+
} else {
128+
// no burst, reset the counter
129+
burstAttempts = 0
130+
backoffStrategy.Reset()
131+
}
132+
lastExitErrorTime = time.Now()
133+
134+
go func() {
135+
notifyErr := notifyDockerUpFail()
136+
if notifyErr != nil {
137+
log.WithError(notifyErr).Error("cannot notify about docker-up failure")
138+
}
139+
}()
140+
} else {
141+
// transient condition like a docker daemon get killed because of OOM
142+
burstAttempts = 0
143+
backoffStrategy.Reset()
144+
}
145+
nextBackOff := backoffStrategy.NextBackOff()
146+
147+
if nextBackOff == backoff.Stop || burstAttempts >= maxBurstDockerStartAttempts {
148+
cancel()
149+
log.WithError(err).WithField("attempts", burstAttempts).Error("cannot activate docker after maximum attempts, stopping trying permanently")
150+
if logFile != nil {
151+
fmt.Fprintln(logFile, "Cannot activate docker after maximum attempts, stopping trying permanently.")
152+
fmt.Fprintln(logFile, "Please check logs above, fix the configuration, use `gp validate` to verify change, and commit to apply.")
153+
fmt.Fprintf(logFile, "If it does not help, please reach out to the support. Don't forget to share your workspace ID: %s.\n", cfg.WorkspaceID)
154+
}
155+
continue
156+
}
157+
158+
if err != nil &&
159+
!errors.Is(err, context.Canceled) &&
160+
!process.IsNotChildProcess(err) &&
161+
!strings.Contains(err.Error(), "signal: ") {
162+
// don't log typical transient errors
163+
log.WithError(err).WithField("attempts", burstAttempts).WithField("backoff", nextBackOff.String()).Error("cannot activate docker, retrying...")
164+
}
165+
time.Sleep(nextBackOff)
166+
}
167+
}
168+
169+
// listenToDockerSocket listens to the docker socket and starts the docker-up process
170+
// if it fails to start then listener is closed to requests all incoming requests and prevent infinite try
171+
func listenToDockerSocket(parentCtx context.Context, term *terminal.Mux, cfg *Config, telemetry analytics.Writer, logFile *os.File) error {
172+
ctx, cancel := context.WithCancel(parentCtx)
173+
defer cancel()
174+
175+
fn := "/var/run/docker.sock"
176+
l, err := net.Listen("unix", fn)
177+
if err != nil {
178+
return err
179+
}
180+
181+
go func() {
182+
<-ctx.Done()
183+
l.Close()
184+
}()
185+
186+
_ = os.Chown(fn, gitpodUID, gitpodGID)
187+
return activation.Listen(ctx, l, func(socketFD *os.File) error {
188+
defer socketFD.Close()
189+
startTime := time.Now()
190+
telemetry.Track(analytics.TrackMessage{
191+
Identity: analytics.Identity{UserID: cfg.OwnerId},
192+
Event: "gitpod_activate_docker",
193+
Properties: map[string]interface{}{
194+
"instanceId": cfg.WorkspaceInstanceID,
195+
"workspaceId": cfg.WorkspaceID,
196+
"debugWorkspace": cfg.isDebugWorkspace(),
197+
},
198+
})
199+
200+
var stdout, stderr io.Writer
201+
202+
stdout = os.Stdout
203+
stderr = os.Stderr
204+
if cfg.WorkspaceLogRateLimit > 0 {
205+
limit := int64(cfg.WorkspaceLogRateLimit)
206+
stdout = dropwriter.Writer(stdout, dropwriter.NewBucket(limit*1024*3, limit*1024))
207+
stderr = dropwriter.Writer(stderr, dropwriter.NewBucket(limit*1024*3, limit*1024))
208+
log.WithField("limit_kb_per_sec", limit).Info("docker-up: rate limiting log output")
209+
}
210+
211+
if logFile != nil {
212+
defer fmt.Fprintf(logFile, "\n======= Stop docker-up at %s =======\n", time.Now().Format(time.RFC3339))
213+
fmt.Fprintf(logFile, "======= Start docker-up at %s =======\n", startTime.Format(time.RFC3339))
214+
215+
stdout = io.MultiWriter(stdout, logFile)
216+
stderr = io.MultiWriter(stderr, logFile)
217+
}
218+
219+
cmd := exec.CommandContext(ctx, "/usr/bin/docker-up")
220+
cmd.Env = append(os.Environ(), "LISTEN_FDS=1")
221+
cmd.ExtraFiles = []*os.File{socketFD}
222+
cmd.Stdout = stdout
223+
cmd.Stderr = stderr
224+
225+
err = cmd.Start()
226+
if err != nil {
227+
return err
228+
}
229+
ptyCtx, cancel := context.WithCancel(ctx)
230+
go func(ptyCtx context.Context) {
231+
select {
232+
case <-ctx.Done():
233+
_ = cmd.Process.Signal(syscall.SIGTERM)
234+
case <-ptyCtx.Done():
235+
}
236+
}(ptyCtx)
237+
err = cmd.Wait()
238+
cancel()
239+
return err
240+
})
241+
}
242+
243+
func openDockerUpLogFile() (*os.File, error) {
244+
if err := os.MkdirAll(logsDir, 0755); err != nil {
245+
return nil, xerrors.Errorf("cannot create logs dir: %w", err)
246+
}
247+
if err := os.Chown(logsDir, gitpodUID, gitpodGID); err != nil {
248+
return nil, xerrors.Errorf("cannot chown logs dir: %w", err)
249+
}
250+
logFile, err := os.OpenFile(dockerUpLogFilePath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
251+
if err != nil {
252+
return nil, xerrors.Errorf("cannot open docker-up log file: %w", err)
253+
}
254+
255+
if err := os.Chown(dockerUpLogFilePath, gitpodUID, gitpodGID); err != nil {
256+
_ = logFile.Close()
257+
return nil, xerrors.Errorf("cannot chown docker-up log file: %w", err)
258+
}
259+
return logFile, nil
260+
}

components/supervisor/pkg/supervisor/supervisor.go

Lines changed: 1 addition & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -50,13 +50,11 @@ import (
5050
"github.com/gitpod-io/gitpod/common-go/analytics"
5151
"github.com/gitpod-io/gitpod/common-go/log"
5252
"github.com/gitpod-io/gitpod/common-go/pprof"
53-
"github.com/gitpod-io/gitpod/common-go/process"
5453
csapi "github.com/gitpod-io/gitpod/content-service/api"
5554
"github.com/gitpod-io/gitpod/content-service/pkg/executor"
5655
"github.com/gitpod-io/gitpod/content-service/pkg/git"
5756
gitpod "github.com/gitpod-io/gitpod/gitpod-protocol"
5857
"github.com/gitpod-io/gitpod/supervisor/api"
59-
"github.com/gitpod-io/gitpod/supervisor/pkg/activation"
6058
"github.com/gitpod-io/gitpod/supervisor/pkg/config"
6159
"github.com/gitpod-io/gitpod/supervisor/pkg/dropwriter"
6260
"github.com/gitpod-io/gitpod/supervisor/pkg/metrics"
@@ -409,7 +407,7 @@ func Run(options ...RunOption) {
409407

410408
if !opts.RunGP {
411409
wg.Add(1)
412-
go socketActivationForDocker(ctx, &wg, termMux, cfg, telemetry)
410+
go socketActivationForDocker(ctx, &wg, termMux, cfg, telemetry, notificationService)
413411
}
414412

415413
if cfg.isHeadless() {
@@ -1548,71 +1546,6 @@ func recordInitializerMetrics(path string, metrics *metrics.SupervisorMetrics) {
15481546
}
15491547
}
15501548

1551-
func socketActivationForDocker(ctx context.Context, wg *sync.WaitGroup, term *terminal.Mux, cfg *Config, w analytics.Writer) {
1552-
defer wg.Done()
1553-
1554-
fn := "/var/run/docker.sock"
1555-
l, err := net.Listen("unix", fn)
1556-
if err != nil {
1557-
log.WithError(err).Error("cannot provide Docker activation socket")
1558-
return
1559-
}
1560-
1561-
go func() {
1562-
<-ctx.Done()
1563-
l.Close()
1564-
}()
1565-
1566-
_ = os.Chown(fn, gitpodUID, gitpodGID)
1567-
for ctx.Err() == nil {
1568-
err = activation.Listen(ctx, l, func(socketFD *os.File) error {
1569-
defer socketFD.Close()
1570-
cmd := exec.Command("/usr/bin/docker-up")
1571-
cmd.Env = append(os.Environ(), "LISTEN_FDS=1")
1572-
cmd.ExtraFiles = []*os.File{socketFD}
1573-
alias, err := term.Start(cmd, terminal.TermOptions{
1574-
Annotations: map[string]string{
1575-
"gitpod.supervisor": "true",
1576-
},
1577-
LogToStdout: true,
1578-
})
1579-
w.Track(analytics.TrackMessage{
1580-
Identity: analytics.Identity{UserID: cfg.OwnerId},
1581-
Event: "gitpod_activate_docker",
1582-
Properties: map[string]interface{}{
1583-
"instanceId": cfg.WorkspaceInstanceID,
1584-
"workspaceId": cfg.WorkspaceID,
1585-
"debugWorkspace": cfg.isDebugWorkspace(),
1586-
},
1587-
})
1588-
if err != nil {
1589-
return err
1590-
}
1591-
pty, ok := term.Get(alias)
1592-
if !ok {
1593-
return errors.New("cannot find pty")
1594-
}
1595-
ptyCtx, cancel := context.WithCancel(context.Background())
1596-
go func(ptyCtx context.Context) {
1597-
select {
1598-
case <-ctx.Done():
1599-
_ = pty.Command.Process.Signal(syscall.SIGTERM)
1600-
case <-ptyCtx.Done():
1601-
}
1602-
}(ptyCtx)
1603-
_, err = pty.Wait()
1604-
cancel()
1605-
return err
1606-
})
1607-
if err != nil &&
1608-
!errors.Is(err, context.Canceled) &&
1609-
!process.IsNotChildProcess(err) &&
1610-
!strings.Contains(err.Error(), "signal: ") {
1611-
log.WithError(err).Error("cannot provide Docker activation socket")
1612-
}
1613-
}
1614-
}
1615-
16161549
type PerfAnalyzer struct {
16171550
label string
16181551
defs []int

0 commit comments

Comments
 (0)