Skip to content

Commit 65129c8

Browse files
committed
Add retry mechanism for scaling errors
Add retry mechanism for scaling errors
1 parent a537f9a commit 65129c8

File tree

5 files changed

+49
-30
lines changed

5 files changed

+49
-30
lines changed

examples/default/main.tf

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,11 @@ module "runners" {
3030
webhook_secret = random_password.random.result
3131
}
3232

33-
webhook_lambda_zip = "lambdas-download/webhook.zip"
34-
runner_binaries_syncer_lambda_zip = "lambdas-download/runner-binaries-syncer.zip"
35-
runners_lambda_zip = "lambdas-download/runners.zip"
36-
enable_organization_runners = false
37-
runner_extra_labels = "default,example"
33+
# webhook_lambda_zip = "lambdas-download/webhook.zip"
34+
# runner_binaries_syncer_lambda_zip = "lambdas-download/runner-binaries-syncer.zip"
35+
# runners_lambda_zip = "lambdas-download/runners.zip"
36+
enable_organization_runners = true
37+
runner_extra_labels = "default,example"
3838

3939
# enable access to the runners via SSM
4040
enable_ssm_on_runners = true
@@ -61,8 +61,14 @@ module "runners" {
6161
instance_types = ["m5.large", "c5.large"]
6262

6363
# override delay of events in seconds
64-
delay_webhook_event = 0
64+
delay_webhook_event = 10
65+
//job_queue_retention_in_seconds = 600
66+
//job_queue_retention_in_seconds = 60
67+
runners_maximum_count = 1
6568

6669
# override scaling down
6770
scale_down_schedule_expression = "cron(* * * * ? *)"
71+
72+
enable_ephemeral_runners = true
73+
disable_check_wokflow_job_labels = true
6874
}

modules/runners/lambdas/runners/src/lambda.ts

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,21 +2,29 @@ import { scaleUp } from './scale-runners/scale-up';
22
import { scaleDown } from './scale-runners/scale-down';
33
import { SQSEvent, ScheduledEvent, Context, Callback } from 'aws-lambda';
44
import { logger } from './scale-runners/logger';
5+
import ScaleError from './scale-runners/ScaleError';
56
import 'source-map-support/register';
67

78
export async function scaleUpHandler(event: SQSEvent, context: Context, callback: Callback): Promise<void> {
89
logger.setSettings({ requestId: context.awsRequestId });
910
logger.debug(JSON.stringify(event));
10-
try {
11-
for (const e of event.Records) {
12-
await scaleUp(e.eventSource, JSON.parse(e.body));
13-
}
14-
15-
callback(null);
16-
} catch (e) {
17-
logger.error(e);
18-
callback('Failed handling SQS event');
11+
// TODO find a more elegant way :(
12+
if (event.Records.length != 1) {
13+
logger.warn('Event ignored, only on record at the time can be handled, ensure the lambda batch size is set to 1.');
14+
return new Promise((resolve) => resolve());
1915
}
16+
17+
return new Promise((resolve, reject) => {
18+
scaleUp(event.Records[0].eventSource, JSON.parse(event.Records[0].body))
19+
.then(() => resolve())
20+
.catch((e: Error) => {
21+
if (e instanceof ScaleError) {
22+
reject(e);
23+
} else {
24+
logger.warn('Ignoring error: ', e);
25+
}
26+
});
27+
});
2028
}
2129

2230
export async function scaleDownHandler(event: ScheduledEvent, context: Context, callback: Callback): Promise<void> {

modules/runners/lambdas/runners/src/scale-runners/runners.ts

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -85,16 +85,19 @@ export async function createRunner(runnerParameters: RunnerInputParameters, laun
8585
.runInstances(getInstanceParams(launchTemplateName, runnerParameters))
8686
.promise();
8787
logger.info('Created instance(s): ', runInstancesResponse.Instances?.map((i) => i.InstanceId).join(','));
88+
8889
const ssm = new SSM();
89-
runInstancesResponse.Instances?.forEach(async (i: EC2.Instance) => {
90-
await ssm
91-
.putParameter({
92-
Name: runnerParameters.environment + '-' + (i.InstanceId as string),
93-
Value: runnerParameters.runnerServiceConfig,
94-
Type: 'SecureString',
95-
})
96-
.promise();
97-
});
90+
if (runInstancesResponse.Instances != undefined) {
91+
for (let i = 0; i < runInstancesResponse.Instances?.length; i++) {
92+
await ssm
93+
.putParameter({
94+
Name: runnerParameters.environment + '-' + (runInstancesResponse.Instances[i].InstanceId as string),
95+
Value: runnerParameters.runnerServiceConfig,
96+
Type: 'SecureString',
97+
})
98+
.promise();
99+
}
100+
}
98101
}
99102

100103
function getInstanceParams(

modules/runners/lambdas/runners/src/scale-runners/scale-up.ts

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import { createOctoClient, createGithubAppAuth, createGithubInstallationAuth } f
33
import yn from 'yn';
44
import { Octokit } from '@octokit/rest';
55
import { logger as rootLogger } from './logger';
6+
import ScaleError from './ScaleError';
67

78
const logger = rootLogger.getChildLogger();
89

@@ -57,17 +58,14 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage
5758
const runnerType = enableOrgLevel ? 'Org' : 'Repo';
5859
const runnerOwner = enableOrgLevel ? payload.repositoryOwner : `${payload.repositoryOwner}/${payload.repositoryName}`;
5960

60-
const isQueued = await getJobStatus(githubInstallationClient, payload);
61-
// ephemeral runners should be created on every event, will only work with `workflow_job` events.
62-
if (ephemeral || isQueued) {
61+
if (ephemeral || (await getJobStatus(githubInstallationClient, payload))) {
6362
const currentRunners = await listEC2Runners({
6463
environment,
6564
runnerType,
6665
runnerOwner,
6766
});
6867
logger.info(`${runnerType} ${runnerOwner} has ${currentRunners.length}/${maximumRunners} runners`);
6968

70-
// TODO: how to handle the event if the max is reached in case of ephemeral runners
7169
if (currentRunners.length < maximumRunners) {
7270
console.info(`Attempting to launch a new runner`);
7371
// create token
@@ -94,7 +92,10 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage
9492
runnerType,
9593
});
9694
} else {
97-
logger.info('No runner will be created, maximum number of runners reached.');
95+
logger.warn('No runner created: maximum number of runners reached.');
96+
if (ephemeral) {
97+
throw new ScaleError('No runners create: maximum of runners reached.');
98+
}
9899
}
99100
}
100101
}
@@ -139,6 +140,6 @@ export async function createRunnerLoop(runnerParameters: RunnerInputParameters):
139140
}
140141
}
141142
if (launched == false) {
142-
throw Error('All launch templates failed');
143+
throw new ScaleError('All launch templates failed');
143144
}
144145
}

modules/runners/scale-up.tf

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ resource "aws_cloudwatch_log_group" "scale_up" {
5050
resource "aws_lambda_event_source_mapping" "scale_up" {
5151
event_source_arn = var.sqs_build_queue.arn
5252
function_name = aws_lambda_function.scale_up.arn
53+
batch_size = 1
5354
}
5455

5556
resource "aws_lambda_permission" "scale_runners_lambda" {

0 commit comments

Comments
 (0)