Skip to content

Commit 18db960

Browse files
add retries to getToken
1 parent bfb0135 commit 18db960

File tree

1 file changed

+37
-23
lines changed

1 file changed

+37
-23
lines changed

server/job_rpc.go

Lines changed: 37 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1267,32 +1267,46 @@ func getToken(ctx context.Context, respTimeout time.Duration, orchUrl, capabilit
12671267
return nil, err
12681268
}
12691269

1270-
resp, err := sendJobReqWithTimeout(tokenReq, respTimeout)
1271-
if err != nil {
1272-
clog.Errorf(ctx, "failed to get token from Orchestrator err=%v", err)
1273-
return nil, err
1274-
}
1275-
defer resp.Body.Close()
1276-
1277-
if resp.StatusCode != http.StatusOK {
1278-
clog.Errorf(ctx, "Failed to get token from Orchestrator %v err=%v", orchUrl, err)
1279-
return nil, fmt.Errorf("failed to get token from Orchestrator")
1280-
}
1281-
1282-
latency := time.Since(start)
1283-
clog.V(common.DEBUG).Infof(ctx, "Received job token from uri=%v, latency=%v", orchUrl, latency)
1270+
var resp *http.Response
1271+
var token []byte
1272+
var jobToken core.JobToken
1273+
var attempt int
1274+
var backoff time.Duration = 100 * time.Millisecond
1275+
deadline := time.Now().Add(respTimeout)
12841276

1285-
token, err := io.ReadAll(resp.Body)
1286-
if err != nil {
1287-
clog.Errorf(ctx, "Failed to read token from Orchestrator %v err=%v", orchUrl, err)
1288-
return nil, err
1277+
for attempt = 0; attempt < 3; attempt++ {
1278+
resp, err = sendJobReqWithTimeout(tokenReq, respTimeout)
1279+
if err != nil {
1280+
clog.Errorf(ctx, "failed to get token from Orchestrator (attempt %d) err=%v", attempt+1, err)
1281+
} else if resp.StatusCode != http.StatusOK {
1282+
clog.Errorf(ctx, "Failed to get token from Orchestrator %v status=%v (attempt %d)", orchUrl, resp.StatusCode, attempt+1)
1283+
} else {
1284+
defer resp.Body.Close()
1285+
latency := time.Since(start)
1286+
clog.V(common.DEBUG).Infof(ctx, "Received job token from uri=%v, latency=%v", orchUrl, latency)
1287+
token, err = io.ReadAll(resp.Body)
1288+
if err != nil {
1289+
clog.Errorf(ctx, "Failed to read token from Orchestrator %v err=%v", orchUrl, err)
1290+
} else {
1291+
err = json.Unmarshal(token, &jobToken)
1292+
if err != nil {
1293+
clog.Errorf(ctx, "Failed to unmarshal token from Orchestrator %v err=%v", orchUrl, err)
1294+
} else {
1295+
return &jobToken, nil
1296+
}
1297+
}
1298+
}
1299+
// If this is not the last attempt and the backoff fits before the deadline, back off before retrying
1300+
if time.Now().Add(backoff).Before(deadline) && attempt < 2 {
1301+
time.Sleep(backoff)
1302+
backoff *= 2
1303+
} else {
1304+
break
1305+
}
12891306
}
1290-
var jobToken core.JobToken
1291-
err = json.Unmarshal(token, &jobToken)
1307+
// All attempts failed
12921308
if err != nil {
1293-
clog.Errorf(ctx, "Failed to unmarshal token from Orchestrator %v err=%v", orchUrl, err)
12941309
return nil, err
12951310
}
1296-
1297-
return &jobToken, nil
1311+
return nil, fmt.Errorf("failed to get token from Orchestrator after %d attempts", attempt)
12981312
}

0 commit comments

Comments
 (0)