From 0b5d97f4ce4fb7e66b0b0813b33842e3aebea76b Mon Sep 17 00:00:00 2001 From: Niek Palm Date: Fri, 10 Dec 2021 17:49:39 +0100 Subject: [PATCH 01/31] add option ephemeral runners --- .ci/build-yarn.sh | 9 +++ README.md | 1 + examples/default/main.tf | 2 +- examples/ephemeral/.terraform.lock.hcl | 57 +++++++++++++++++ examples/ephemeral/README.md | 31 ++++++++++ examples/ephemeral/lambdas-download/main.tf | 25 ++++++++ examples/ephemeral/main.tf | 62 +++++++++++++++++++ examples/ephemeral/outputs.tf | 15 +++++ examples/ephemeral/providers.tf | 3 + examples/ephemeral/variables.tf | 12 ++++ examples/ephemeral/versions.tf | 15 +++++ examples/ephemeral/vpc.tf | 7 +++ main.tf | 1 + modules/runners/README.md | 13 ++-- .../src/scale-runners/scale-up.test.ts | 16 ++--- .../runners/src/scale-runners/scale-up.ts | 17 +++-- modules/runners/policies-runner.tf | 7 +++ modules/runners/policies/instance-ec2.json | 15 +++++ modules/runners/scale-up.tf | 1 + modules/runners/variables.tf | 5 ++ variables.tf | 5 ++ 21 files changed, 300 insertions(+), 19 deletions(-) create mode 100755 .ci/build-yarn.sh create mode 100644 examples/ephemeral/.terraform.lock.hcl create mode 100644 examples/ephemeral/README.md create mode 100644 examples/ephemeral/lambdas-download/main.tf create mode 100644 examples/ephemeral/main.tf create mode 100644 examples/ephemeral/outputs.tf create mode 100644 examples/ephemeral/providers.tf create mode 100644 examples/ephemeral/variables.tf create mode 100644 examples/ephemeral/versions.tf create mode 100644 examples/ephemeral/vpc.tf create mode 100644 modules/runners/policies/instance-ec2.json diff --git a/.ci/build-yarn.sh b/.ci/build-yarn.sh new file mode 100755 index 0000000000..b0f847113d --- /dev/null +++ b/.ci/build-yarn.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +lambdaSrcDirs=("modules/runner-binaries-syncer/lambdas/runner-binaries-syncer" "modules/runners/lambdas/runners" "modules/webhook/lambdas/webhook") +repoRoot=$(dirname $(dirname $(realpath ${BASH_SOURCE[0]}))) + +for lambdaDir in ${lambdaSrcDirs[@]}; do + cd "$repoRoot/${lambdaDir}" + yarn && yarn run dist +done diff --git a/README.md b/README.md index b38102056b..58af8d4dc5 100644 --- a/README.md +++ b/README.md @@ -382,6 +382,7 @@ In case the setup does not work as intended follow the trace of events: | [delay\_webhook\_event](#input\_delay\_webhook\_event) | The number of seconds the event accepted by the webhook is invisible on the queue before the scale up lambda will receive the event. | `number` | `30` | no | | [disable\_check\_wokflow\_job\_labels](#input\_disable\_check\_wokflow\_job\_labels) | Disable the the check of workflow labels for received workflow job events. | `bool` | `false` | no | | [enable\_cloudwatch\_agent](#input\_enable\_cloudwatch\_agent) | Enabling the cloudwatch agent on the ec2 runner instances, the runner contains default config. Configuration can be overridden via `cloudwatch_config`. | `bool` | `true` | no | +| [enable\_ephemeral\_runners](#input\_enable\_ephemeral\_runners) | Enable ephemeral runners, runners will only be used once. | `bool` | `false` | no | | [enable\_organization\_runners](#input\_enable\_organization\_runners) | Register runners to organization, instead of repo level | `bool` | `false` | no | | [enable\_ssm\_on\_runners](#input\_enable\_ssm\_on\_runners) | Enable to allow access the runner instances for debugging purposes via SSM. Note that this adds additional permissions to the runner instances. 
| `bool` | `false` | no | | [enabled\_userdata](#input\_enabled\_userdata) | Should the userdata script be enabled for the runner. Set this to false if you are using your own prebuilt AMI | `bool` | `true` | no | diff --git a/examples/default/main.tf b/examples/default/main.tf index 2a5ec1edee..6f6d617808 100644 --- a/examples/default/main.tf +++ b/examples/default/main.tf @@ -61,7 +61,7 @@ module "runners" { instance_types = ["m5.large", "c5.large"] # override delay of events in seconds - delay_webhook_event = 5 + delay_webhook_event = 0 # override scaling down scale_down_schedule_expression = "cron(* * * * ? *)" diff --git a/examples/ephemeral/.terraform.lock.hcl b/examples/ephemeral/.terraform.lock.hcl new file mode 100644 index 0000000000..d940521fcb --- /dev/null +++ b/examples/ephemeral/.terraform.lock.hcl @@ -0,0 +1,57 @@ +# This file is maintained automatically by "terraform init". +# Manual edits may be lost in future updates. + +provider "registry.terraform.io/hashicorp/aws" { + version = "3.61.0" + constraints = ">= 3.27.0" + hashes = [ + "h1:fpZ14qQnn+uEOO2ZOlBFHgty48Ol8IOwd+ewxZ4z3zc=", + "zh:0483ca802ddb0ae4f73144b4357ba72242c6e2641aeb460b1aa9a6f6965464b0", + "zh:274712214ebeb0c1269cbc468e5705bb5741dc45b05c05e9793ca97f22a1baa1", + "zh:3c6bd97a2ca809469ae38f6893348386c476cb3065b120b785353c1507401adf", + "zh:53dd41a9aed9860adbbeeb71a23e4f8195c656fd15a02c90fa2d302a5f577d8c", + "zh:65c639c547b97bc880fd83e65511c0f4bbfc91b63cada3b8c0d5776444221700", + "zh:a2769e19137ff480c1dd3e4f248e832df90fb6930a22c66264d9793895161714", + "zh:a5897a99332cc0071e46a71359b86a8e53ab09c1453e94cd7cf45a0b577ff590", + "zh:bdc2353642d16d8e2437a9015cd4216a1772be9736645cc17d1a197480e2b5b7", + "zh:cbeace1deae938f6c0aca3734e6088f3633ca09611aff701c15cb6d42f2b918a", + "zh:d33ca19012aabd98cc03fdeccd0bd5ce56e28f61a1dfbb2eea88e89487de7fb3", + "zh:d548b29a864b0687e85e8a993f208e25e3ecc40fcc5b671e1985754b32fdd658", + ] +} + +provider "registry.terraform.io/hashicorp/local" { + version = "2.1.0" + hashes = [ + "h1:KfieWtVyGWwplSoLIB5usKAUnrIkDQBkWaR5TI+4WYg=", + "zh:0f1ec65101fa35050978d483d6e8916664b7556800348456ff3d09454ac1eae2", + "zh:36e42ac19f5d68467aacf07e6adcf83c7486f2e5b5f4339e9671f68525fc87ab", + "zh:6db9db2a1819e77b1642ec3b5e95042b202aee8151a0256d289f2e141bf3ceb3", + "zh:719dfd97bb9ddce99f7d741260b8ece2682b363735c764cac83303f02386075a", + "zh:7598bb86e0378fd97eaa04638c1a4c75f960f62f69d3662e6d80ffa5a89847fe", + "zh:ad0a188b52517fec9eca393f1e2c9daea362b33ae2eb38a857b6b09949a727c1", + "zh:c46846c8df66a13fee6eff7dc5d528a7f868ae0dcf92d79deaac73cc297ed20c", + "zh:dc1a20a2eec12095d04bf6da5321f535351a594a636912361db20eb2a707ccc4", + "zh:e57ab4771a9d999401f6badd8b018558357d3cbdf3d33cc0c4f83e818ca8e94b", + "zh:ebdcde208072b4b0f8d305ebf2bfdc62c926e0717599dcf8ec2fd8c5845031c3", + "zh:ef34c52b68933bedd0868a13ccfd59ff1c820f299760b3c02e008dc95e2ece91", + ] +} + +provider "registry.terraform.io/hashicorp/random" { + version = "3.1.0" + hashes = [ + "h1:rKYu5ZUbXwrLG1w81k7H3nce/Ys6yAxXhWcbtk36HjY=", + "zh:2bbb3339f0643b5daa07480ef4397bd23a79963cc364cdfbb4e86354cb7725bc", + "zh:3cd456047805bf639fbf2c761b1848880ea703a054f76db51852008b11008626", + "zh:4f251b0eda5bb5e3dc26ea4400dba200018213654b69b4a5f96abee815b4f5ff", + "zh:7011332745ea061e517fe1319bd6c75054a314155cb2c1199a5b01fe1889a7e2", + "zh:738ed82858317ccc246691c8b85995bc125ac3b4143043219bd0437adc56c992", + "zh:7dbe52fac7bb21227acd7529b487511c91f4107db9cc4414f50d04ffc3cab427", + "zh:a3a9251fb15f93e4cfc1789800fc2d7414bbc18944ad4c5c98f466e6477c42bc", + 
"zh:a543ec1a3a8c20635cf374110bd2f87c07374cf2c50617eee2c669b3ceeeaa9f", + "zh:d9ab41d556a48bd7059f0810cf020500635bfc696c9fc3adab5ea8915c1d886b", + "zh:d9e13427a7d011dbd654e591b0337e6074eef8c3b9bb11b2e39eaaf257044fd7", + "zh:f7605bd1437752114baf601bdf6931debe6dc6bfe3006eb7e9bb9080931dca8a", + ] +} diff --git a/examples/ephemeral/README.md b/examples/ephemeral/README.md new file mode 100644 index 0000000000..b2b177bc9f --- /dev/null +++ b/examples/ephemeral/README.md @@ -0,0 +1,31 @@ +# Action runners deployment ephemeral example + +This module shows how to create GitHub action runners. Lambda release will be downloaded from GitHub. + +## Usages + +Steps for the full setup, such as creating a GitHub app can be found in the root module's [README](../../README.md). First download the Lambda releases from GitHub. Alternatively you can build the lambdas locally with Node or Docker, there is a simple build script in `/.ci/build.sh`. In the `main.tf` you can simply remove the location of the lambda zip files, the default location will work in this case. + +> Ensure you have set the version in `lambdas-download/main.tf` for running the example. The version needs to be set to a GitHub release version, see https://github.com/philips-labs/terraform-aws-github-runner/releases + +```bash +cd lambdas-download +terraform init +terraform apply +cd .. +``` + +Before running Terraform, ensure the GitHub app is configured. See the [configuration details](../../README.md#usages) for more details. + +```bash +terraform init +terraform apply +``` + +You can receive the webhook details by running: + +```bash +terraform output -raw webhook_secret +``` + +Be-aware some shells will print some end of line character `%`. \ No newline at end of file diff --git a/examples/ephemeral/lambdas-download/main.tf b/examples/ephemeral/lambdas-download/main.tf new file mode 100644 index 0000000000..87f31bd8a9 --- /dev/null +++ b/examples/ephemeral/lambdas-download/main.tf @@ -0,0 +1,25 @@ +locals { + version = "" +} + +module "lambdas" { + source = "../../../modules/download-lambda" + lambdas = [ + { + name = "webhook" + tag = local.version + }, + { + name = "runners" + tag = local.version + }, + { + name = "runner-binaries-syncer" + tag = local.version + } + ] +} + +output "files" { + value = module.lambdas.files +} diff --git a/examples/ephemeral/main.tf b/examples/ephemeral/main.tf new file mode 100644 index 0000000000..765b15b4ef --- /dev/null +++ b/examples/ephemeral/main.tf @@ -0,0 +1,62 @@ +locals { + environment = "default" + aws_region = "eu-west-1" +} + +resource "random_password" "random" { + length = 28 +} + + +################################################################################ +### Hybrid acccount +################################################################################ + +module "runners" { + source = "../../" + create_service_linked_role_spot = true + aws_region = local.aws_region + vpc_id = module.vpc.vpc_id + subnet_ids = module.vpc.private_subnets + + environment = local.environment + tags = { + Project = "ProjectX" + } + + github_app = { + key_base64 = var.github_app_key_base64 + id = var.github_app_id + webhook_secret = random_password.random.result + } + + # webhook_lambda_zip = "lambdas-download/webhook.zip" + # runner_binaries_syncer_lambda_zip = "lambdas-download/runner-binaries-syncer.zip" + # runners_lambda_zip = "lambdas-download/runners.zip" + enable_organization_runners = true + runner_extra_labels = "default,example" + + # enable access to the runners via SSM + 
enable_ssm_on_runners = true + + # Uncommet idle config to have idle runners from 9 to 5 in time zone Amsterdam + # idle_config = [{ + # cron = "* * 9-17 * * *" + # timeZone = "Europe/Amsterdam" + # idleCount = 1 + # }] + + # Let the module manage the service linked role + # create_service_linked_role_spot = true + + instance_types = ["m5d.large"] + + # override delay of events in seconds + delay_webhook_event = 0 + + # override scaling down + scale_down_schedule_expression = "cron(* * * * ? *)" + + enable_ephemeral_runners = true + +} diff --git a/examples/ephemeral/outputs.tf b/examples/ephemeral/outputs.tf new file mode 100644 index 0000000000..d6886efe36 --- /dev/null +++ b/examples/ephemeral/outputs.tf @@ -0,0 +1,15 @@ +output "runners" { + value = { + lambda_syncer_name = module.runners.binaries_syncer.lambda.function_name + } +} + +output "webhook_endpoint" { + value = module.runners.webhook.endpoint +} + +output "webhook_secret" { + sensitive = true + value = random_password.random.result +} + diff --git a/examples/ephemeral/providers.tf b/examples/ephemeral/providers.tf new file mode 100644 index 0000000000..b6c81d5415 --- /dev/null +++ b/examples/ephemeral/providers.tf @@ -0,0 +1,3 @@ +provider "aws" { + region = local.aws_region +} diff --git a/examples/ephemeral/variables.tf b/examples/ephemeral/variables.tf new file mode 100644 index 0000000000..e43d3efccf --- /dev/null +++ b/examples/ephemeral/variables.tf @@ -0,0 +1,12 @@ + +variable "github_app_key_base64" {} + +variable "github_app_id" {} + +variable "github_app_client_id" {} + +variable "github_app_client_secret" {} + +variable "owner" { + +} diff --git a/examples/ephemeral/versions.tf b/examples/ephemeral/versions.tf new file mode 100644 index 0000000000..c96d0eee84 --- /dev/null +++ b/examples/ephemeral/versions.tf @@ -0,0 +1,15 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 3.27" + } + local = { + source = "hashicorp/local" + } + random = { + source = "hashicorp/random" + } + } + required_version = ">= 0.14" +} diff --git a/examples/ephemeral/vpc.tf b/examples/ephemeral/vpc.tf new file mode 100644 index 0000000000..a7d21422f1 --- /dev/null +++ b/examples/ephemeral/vpc.tf @@ -0,0 +1,7 @@ +module "vpc" { + source = "git::https://github.com/philips-software/terraform-aws-vpc.git?ref=2.2.0" + + environment = local.environment + aws_region = local.aws_region + create_private_hosted_zone = false +} diff --git a/main.tf b/main.tf index 48f5d7a88b..595e06cde9 100644 --- a/main.tf +++ b/main.tf @@ -92,6 +92,7 @@ module "runners" { sqs_build_queue = aws_sqs_queue.queued_builds github_app_parameters = local.github_app_parameters enable_organization_runners = var.enable_organization_runners + enable_ephemeral_runners = var.enable_ephemeral_runners scale_down_schedule_expression = var.scale_down_schedule_expression minimum_running_time_in_minutes = var.minimum_running_time_in_minutes runner_boot_time_in_minutes = var.runner_boot_time_in_minutes diff --git a/modules/runners/README.md b/modules/runners/README.md index 05cb8b2c42..6f9772e561 100644 --- a/modules/runners/README.md +++ b/modules/runners/README.md @@ -81,6 +81,7 @@ No modules. 
| [aws_iam_role_policy.cloudwatch](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | | [aws_iam_role_policy.describe_tags](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | | [aws_iam_role_policy.dist_bucket](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | +| [aws_iam_role_policy.ec2](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | | [aws_iam_role_policy.runner_session_manager_aws_managed](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | | [aws_iam_role_policy.scale_down](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | | [aws_iam_role_policy.scale_down_logging](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | @@ -110,7 +111,7 @@ No modules. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [ami\_filter](#input\_ami\_filter) | Map of lists used to create the AMI filter for the action runner AMI. | `map(list(string))` |
{
"name": [
"amzn2-ami-hvm-2.*-x86_64-ebs"
]
}
| no | +| [ami\_filter](#input\_ami\_filter) | Map of lists used to create the AMI filter for the action runner AMI. | `map(list(string))` | `null` | no | | [ami\_owners](#input\_ami\_owners) | The list of owners used to select the AMI of action runner instances. | `list(string)` |
[
"amazon"
]
| no | | [aws\_region](#input\_aws\_region) | AWS region. | `string` | n/a | yes | | [block\_device\_mappings](#input\_block\_device\_mappings) | The EC2 instance block device configuration. Takes the following keys: `device_name`, `delete_on_termination`, `volume_type`, `volume_size`, `encrypted`, `iops` | `map(string)` | `{}` | no | @@ -118,8 +119,10 @@ No modules. | [create\_service\_linked\_role\_spot](#input\_create\_service\_linked\_role\_spot) | (optional) create the service linked role for spot instances that is required by the scale-up lambda. | `bool` | `false` | no | | [egress\_rules](#input\_egress\_rules) | List of egress rules for the GitHub runner instances. |
list(object({
cidr_blocks = list(string)
ipv6_cidr_blocks = list(string)
prefix_list_ids = list(string)
from_port = number
protocol = string
security_groups = list(string)
self = bool
to_port = number
description = string
}))
|
[
{
"cidr_blocks": [
"0.0.0.0/0"
],
"description": null,
"from_port": 0,
"ipv6_cidr_blocks": [
"::/0"
],
"prefix_list_ids": null,
"protocol": "-1",
"security_groups": null,
"self": null,
"to_port": 0
}
]
| no | | [enable\_cloudwatch\_agent](#input\_enable\_cloudwatch\_agent) | Enabling the cloudwatch agent on the ec2 runner instances, the runner contains default config. Configuration can be overridden via `cloudwatch_config`. | `bool` | `true` | no | +| [enable\_ephemeral\_runners](#input\_enable\_ephemeral\_runners) | Enable ephemeral runners, runners will only be used once. | `bool` | `false` | no | | [enable\_organization\_runners](#input\_enable\_organization\_runners) | n/a | `bool` | n/a | yes | | [enable\_ssm\_on\_runners](#input\_enable\_ssm\_on\_runners) | Enable to allow access to the runner instances for debugging purposes via SSM. Note that this adds additional permissions to the runner instances. | `bool` | n/a | yes | +| [enabled\_userdata](#input\_enabled\_userdata) | Should the userdata script be enabled for the runner. Set this to false if you are using your own prebuilt AMI | `bool` | `true` | no | | [environment](#input\_environment) | A name that identifies the environment, used as prefix and for tagging. | `string` | n/a | yes | | [ghes\_ssl\_verify](#input\_ghes\_ssl\_verify) | GitHub Enterprise SSL verification. Set to 'false' when custom certificate (chains) is used for GitHub Enterprise Server (insecure). | `bool` | `true` | no | | [ghes\_url](#input\_ghes\_url) | GitHub Enterprise Server URL. DO NOT SET IF USING PUBLIC GITHUB | `string` | `null` | no | @@ -127,7 +130,7 @@ No modules. | [idle\_config](#input\_idle\_config) | List of time period that can be defined as cron expression to keep a minimum amount of runners active instead of scaling down to 0. By defining this list you can ensure that in time periods that match the cron expression within 5 seconds a runner is kept idle. |
list(object({
cron = string
timeZone = string
idleCount = number
}))
| `[]` | no | | [instance\_profile\_path](#input\_instance\_profile\_path) | The path that will be added to the instance\_profile, if not set the environment name will be used. | `string` | `null` | no | | [instance\_type](#input\_instance\_type) | [DEPRECATED] See instance\_types. | `string` | `"m5.large"` | no | -| [instance\_types](#input\_instance\_types) | List of instance types for the action runner. | `list(string)` | `null` | no | +| [instance\_types](#input\_instance\_types) | List of instance types for the action runner. Defaults are based on runner\_os (amzn2 for linux and Windows Server Core for win). | `list(string)` | `null` | no | | [key\_name](#input\_key\_name) | Key pair name | `string` | `null` | no | | [kms\_key\_arn](#input\_kms\_key\_arn) | Optional CMK Key ARN to be used for Parameter Store. | `string` | `null` | no | | [lambda\_s3\_bucket](#input\_lambda\_s3\_bucket) | S3 bucket from which to specify lambda functions. This is an alternative to providing local files directly. | `any` | `null` | no | @@ -141,7 +144,7 @@ No modules. | [logging\_retention\_in\_days](#input\_logging\_retention\_in\_days) | Specifies the number of days you want to retain log events for the lambda log group. Possible values are: 0, 1, 3, 5, 7, 14, 30, 60, 90, 120, 150, 180, 365, 400, 545, 731, 1827, and 3653. | `number` | `180` | no | | [market\_options](#input\_market\_options) | Market options for the action runner instances. | `string` | `"spot"` | no | | [metadata\_options](#input\_metadata\_options) | Metadata options for the ec2 runner instances. | `map(any)` |
{
"http_endpoint": "enabled",
"http_put_response_hop_limit": 1,
"http_tokens": "optional"
}
| no | -| [minimum\_running\_time\_in\_minutes](#input\_minimum\_running\_time\_in\_minutes) | The time an ec2 action runner should be running at minimum before terminated if non busy. | `number` | `5` | no | +| [minimum\_running\_time\_in\_minutes](#input\_minimum\_running\_time\_in\_minutes) | The time an ec2 action runner should be running at minimum before terminated if non busy. If not set the default is calculated based on the OS. | `number` | `null` | no | | [overrides](#input\_overrides) | This map provides the possibility to override some defaults. The following attributes are supported: `name_sg` overrides the `Name` tag for all security groups created by this module. `name_runner_agent_instance` overrides the `Name` tag for the ec2 instance defined in the auto launch configuration. `name_docker_machine_runners` overrides the `Name` tag spot instances created by the runner agent. | `map(string)` |
{
"name_runner": "",
"name_sg": ""
}
| no | | [role\_path](#input\_role\_path) | The path that will be added to the role; if not set, the environment name will be used. | `string` | `null` | no | | [role\_permissions\_boundary](#input\_role\_permissions\_boundary) | Permissions boundary that will be added to the created role for the lambda. | `string` | `null` | no | @@ -153,7 +156,8 @@ No modules. | [runner\_extra\_labels](#input\_runner\_extra\_labels) | Extra labels for the runners (GitHub). Separate each label by a comma | `string` | `""` | no | | [runner\_group\_name](#input\_runner\_group\_name) | Name of the runner group. | `string` | `"Default"` | no | | [runner\_iam\_role\_managed\_policy\_arns](#input\_runner\_iam\_role\_managed\_policy\_arns) | Attach AWS or customer-managed IAM policies (by ARN) to the runner IAM role | `list(string)` | `[]` | no | -| [runner\_log\_files](#input\_runner\_log\_files) | (optional) List of logfiles to send to CloudWatch, will only be used if `enable_cloudwatch_agent` is set to true. Object description: `log_group_name`: Name of the log group, `prefix_log_group`: If true, the log group name will be prefixed with `/github-self-hosted-runners/`, `file_path`: path to the log file, `log_stream_name`: name of the log stream. |
list(object({
log_group_name = string
prefix_log_group = bool
file_path = string
log_stream_name = string
}))
|
[
{
"file_path": "/var/log/messages",
"log_group_name": "messages",
"log_stream_name": "{instance_id}",
"prefix_log_group": true
},
{
"file_path": "/var/log/user-data.log",
"log_group_name": "user_data",
"log_stream_name": "{instance_id}",
"prefix_log_group": true
},
{
"file_path": "/var/log/runner-startup.log",
"log_group_name": "runner-startup",
"log_stream_name": "{instance_id}",
"prefix_log_group": true
},
{
"file_path": "/home/ec2-user/actions-runner/_diag/Runner_**.log",
"log_group_name": "runner",
"log_stream_name": "{instance_id}",
"prefix_log_group": true
}
]
| no | +| [runner\_log\_files](#input\_runner\_log\_files) | (optional) List of logfiles to send to CloudWatch, will only be used if `enable_cloudwatch_agent` is set to true. Object description: `log_group_name`: Name of the log group, `prefix_log_group`: If true, the log group name will be prefixed with `/github-self-hosted-runners/`, `file_path`: path to the log file, `log_stream_name`: name of the log stream. |
list(object({
log_group_name = string
prefix_log_group = bool
file_path = string
log_stream_name = string
}))
| `null` | no | +| [runner\_os](#input\_runner\_os) | The EC2 Operating System type to use for action runner instances (linux,win). | `string` | `"linux"` | no | | [runners\_lambda\_s3\_key](#input\_runners\_lambda\_s3\_key) | S3 key for runners lambda function. Required if using S3 bucket to specify lambdas. | `any` | `null` | no | | [runners\_lambda\_s3\_object\_version](#input\_runners\_lambda\_s3\_object\_version) | S3 object version for runners lambda function. Useful if S3 versioning is enabled on source bucket. | `any` | `null` | no | | [runners\_maximum\_count](#input\_runners\_maximum\_count) | The maximum number of runners that will be created. | `number` | `3` | no | @@ -164,7 +168,6 @@ No modules. | [sqs\_build\_queue](#input\_sqs\_build\_queue) | SQS queue to consume accepted build events. |
object({
arn = string
})
| n/a | yes | | [subnet\_ids](#input\_subnet\_ids) | List of subnets in which the action runners will be launched, the subnets needs to be subnets in the `vpc_id`. | `list(string)` | n/a | yes | | [tags](#input\_tags) | Map of tags that will be added to created resources. By default resources will be tagged with name and environment. | `map(string)` | `{}` | no | -| [enabled\_userdata](#input\_enabled_userdata) | Should the userdata script be enabled for the runner. Set this to false if you are using your own prebuilt AMI | `bool` | `true` | no | | [userdata\_post\_install](#input\_userdata\_post\_install) | User-data script snippet to insert after GitHub action runner install | `string` | `""` | no | | [userdata\_pre\_install](#input\_userdata\_pre\_install) | User-data script snippet to insert before GitHub action runner install | `string` | `""` | no | | [userdata\_template](#input\_userdata\_template) | Alternative user-data template, replacing the default template. By providing your own user\_data you have to take care of installing all required software, including the action runner. Variables userdata\_pre/post\_install are ignored. | `string` | `null` | no | diff --git a/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts b/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts index 82191bc427..29a08d84c9 100644 --- a/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts +++ b/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts @@ -53,7 +53,7 @@ const cleanEnv = process.env; const EXPECTED_RUNNER_PARAMS: RunnerInputParameters = { environment: 'unit-test-environment', - runnerServiceConfig: `--url https://github.enterprise.something/${TEST_DATA.repositoryOwner} --token 1234abcd `, + runnerServiceConfig: `--url https://github.enterprise.something/${TEST_DATA.repositoryOwner} --token 1234abcd`, runnerType: 'Org', runnerOwner: TEST_DATA.repositoryOwner, }; @@ -226,7 +226,7 @@ describe('scaleUp with GHES', () => { process.env.RUNNER_GROUP_NAME = 'TEST_GROUP'; await scaleUpModule.scaleUp('aws:sqs', TEST_DATA); expectedRunnerParams.runnerServiceConfig = - expectedRunnerParams.runnerServiceConfig + `--labels label1,label2 --runnergroup TEST_GROUP`; + expectedRunnerParams.runnerServiceConfig + ` --labels label1,label2 --runnergroup TEST_GROUP`; expect(createRunner).toBeCalledWith(expectedRunnerParams, 'lt-1'); }); @@ -260,7 +260,7 @@ describe('scaleUp with GHES', () => { expectedRunnerParams.runnerServiceConfig = `--url ` + `https://github.enterprise.something/${TEST_DATA.repositoryOwner}/${TEST_DATA.repositoryName} ` + - `--token 1234abcd `; + `--token 1234abcd`; }); it('gets the current repo level runners', async () => { @@ -407,7 +407,7 @@ describe('scaleUp with public GH', () => { process.env.ENABLE_ORGANIZATION_RUNNERS = 'true'; expectedRunnerParams = { ...EXPECTED_RUNNER_PARAMS }; expectedRunnerParams.runnerServiceConfig = - `--url https://github.com/${TEST_DATA.repositoryOwner} ` + `--token 1234abcd `; + `--url https://github.com/${TEST_DATA.repositoryOwner} ` + `--token 1234abcd`; }); it('gets the current org level runners', async () => { @@ -449,7 +449,7 @@ describe('scaleUp with public GH', () => { process.env.RUNNER_GROUP_NAME = 'TEST_GROUP'; await scaleUpModule.scaleUp('aws:sqs', TEST_DATA); expectedRunnerParams.runnerServiceConfig = - expectedRunnerParams.runnerServiceConfig + `--labels label1,label2 --runnergroup TEST_GROUP`; + expectedRunnerParams.runnerServiceConfig + ` --labels 
label1,label2 --runnergroup TEST_GROUP`; expect(createRunner).toBeCalledWith(expectedRunnerParams, LAUNCH_TEMPLATE); }); @@ -470,7 +470,7 @@ describe('scaleUp with public GH', () => { expectedRunnerParams.runnerType = 'Repo'; expectedRunnerParams.runnerOwner = `${TEST_DATA.repositoryOwner}/${TEST_DATA.repositoryName}`; expectedRunnerParams.runnerServiceConfig = - `--url https://github.com/${TEST_DATA.repositoryOwner}/${TEST_DATA.repositoryName} ` + `--token 1234abcd `; + `--url https://github.com/${TEST_DATA.repositoryOwner}/${TEST_DATA.repositoryName} ` + `--token 1234abcd`; }); it('gets the current repo level runners', async () => { @@ -521,7 +521,7 @@ describe('scaleUp with public GH', () => { it('creates a runner with correct config and labels', async () => { process.env.RUNNER_EXTRA_LABELS = 'label1,label2'; await scaleUpModule.scaleUp('aws:sqs', TEST_DATA); - expectedRunnerParams.runnerServiceConfig = expectedRunnerParams.runnerServiceConfig + `--labels label1,label2`; + expectedRunnerParams.runnerServiceConfig = expectedRunnerParams.runnerServiceConfig + ` --labels label1,label2`; expect(createRunner).toBeCalledWith(expectedRunnerParams, LAUNCH_TEMPLATE); }); @@ -529,7 +529,7 @@ describe('scaleUp with public GH', () => { process.env.RUNNER_EXTRA_LABELS = 'label1,label2'; process.env.RUNNER_GROUP_NAME = 'TEST_GROUP_IGNORED'; await scaleUpModule.scaleUp('aws:sqs', TEST_DATA); - expectedRunnerParams.runnerServiceConfig = expectedRunnerParams.runnerServiceConfig + `--labels label1,label2`; + expectedRunnerParams.runnerServiceConfig = expectedRunnerParams.runnerServiceConfig + ` --labels label1,label2`; expect(createRunner).toBeCalledWith(expectedRunnerParams, LAUNCH_TEMPLATE); }); diff --git a/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts b/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts index 6e6c40f606..bd2dce6f86 100644 --- a/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts +++ b/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts @@ -22,6 +22,10 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage const runnerGroup = process.env.RUNNER_GROUP_NAME; const environment = process.env.ENVIRONMENT; const ghesBaseUrl = process.env.GHES_URL; + const ephemeralEnabled = yn(process.env.ENABLE_EPHEMERAL_RUNNERS, { default: false }); + + // TODO: handle case event is check_run and ephemeralEnabled = true + const ephemeral = ephemeralEnabled && payload.eventType === 'workflow_job'; const runnerType = enableOrgLevel ? 'Org' : 'Repo'; const runnerOwner = enableOrgLevel ? payload.repositoryOwner : `${payload.repositoryOwner}/${payload.repositoryName}`; @@ -61,7 +65,8 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage const githubInstallationClient = await createOctoClient(ghAuth.token, ghesApiUrl); const isQueued = await getJobStatus(githubInstallationClient, payload); - if (isQueued) { + // ephemeral runners should be created on every event, will only work with `workflow_job` events. 
+ if (ephemeral || isQueued) { const currentRunners = await listEC2Runners({ environment, runnerType, @@ -69,6 +74,7 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage }); logger.info(`Current runners: ${currentRunners.length} of ${maximumRunners}`, LogFields.print()); + // TODO: how to handle the event if the max is reached in case of ephemeral runners if (currentRunners.length < maximumRunners) { logger.info(`Attempting to launch a new runner`, LogFields.print()); // create token @@ -81,15 +87,16 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage const token = registrationToken.data.token; const labelsArgument = runnerExtraLabels !== undefined ? `--labels ${runnerExtraLabels}` : ''; - const runnerGroupArgument = runnerGroup !== undefined ? ` --runnergroup ${runnerGroup}` : ''; + const runnerGroupArgument = runnerGroup !== undefined ? `--runnergroup ${runnerGroup}` : ''; const configBaseUrl = ghesBaseUrl ? ghesBaseUrl : 'https://github.com'; + const ephemeralArgument = ephemeral ? '--ephemeral' : ''; + const runnerArgs = `--token ${token} ${labelsArgument} ${ephemeralArgument}`.trim(); await createRunnerLoop({ environment, runnerServiceConfig: enableOrgLevel - ? `--url ${configBaseUrl}/${payload.repositoryOwner} --token ${token} ${labelsArgument}${runnerGroupArgument}` - : `--url ${configBaseUrl}/${payload.repositoryOwner}/${payload.repositoryName} ` + - `--token ${token} ${labelsArgument}`, + ? `--url ${configBaseUrl}/${payload.repositoryOwner} ${runnerArgs} ${runnerGroupArgument}`.trim() + : `--url ${configBaseUrl}/${payload.repositoryOwner}/${payload.repositoryName} ${runnerArgs}`.trim(), runnerOwner, runnerType, }); diff --git a/modules/runners/policies-runner.tf b/modules/runners/policies-runner.tf index 2ac1b87454..dc90d47b0b 100644 --- a/modules/runners/policies-runner.tf +++ b/modules/runners/policies-runner.tf @@ -54,4 +54,11 @@ resource "aws_iam_role_policy_attachment" "managed_policies" { policy_arn = element(var.runner_iam_role_managed_policy_arns, count.index) } + +resource "aws_iam_role_policy" "ec2" { + name = "ec2" + role = aws_iam_role.runner.name + policy = templatefile("${path.module}/policies/instance-ec2.json", {}) +} + // see also logging.tf for logging and metrics policies diff --git a/modules/runners/policies/instance-ec2.json b/modules/runners/policies/instance-ec2.json new file mode 100644 index 0000000000..b2a22cb2e2 --- /dev/null +++ b/modules/runners/policies/instance-ec2.json @@ -0,0 +1,15 @@ +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": "ec2:TerminateInstances", + "Resource": "*", + "Condition": { + "StringEquals": { + "ec2:ResourceTag/Application": "github-action-runner" + } + } + } + ] +} \ No newline at end of file diff --git a/modules/runners/scale-up.tf b/modules/runners/scale-up.tf index 486ce3de9f..77057dd400 100644 --- a/modules/runners/scale-up.tf +++ b/modules/runners/scale-up.tf @@ -28,6 +28,7 @@ resource "aws_lambda_function" "scale_up" { RUNNER_GROUP_NAME = var.runner_group_name RUNNERS_MAXIMUM_COUNT = var.runners_maximum_count SUBNET_IDS = join(",", var.subnet_ids) + ENABLE_EPHEMERAL_RUNNERS = var.enable_ephemeral_runners } } diff --git a/modules/runners/variables.tf b/modules/runners/variables.tf index 80416b7894..c46e88340a 100644 --- a/modules/runners/variables.tf +++ b/modules/runners/variables.tf @@ -416,5 +416,10 @@ variable "metadata_options" { http_tokens = "optional" http_put_response_hop_limit = 1 } +} +variable 
"enable_ephemeral_runners" { + description = "Enable ephemeral runners, runners will only be used once." + type = bool + default = false } diff --git a/variables.tf b/variables.tf index 103b400937..df98e1c79d 100644 --- a/variables.tf +++ b/variables.tf @@ -451,7 +451,12 @@ variable "runner_metadata_options" { http_tokens = "optional" http_put_response_hop_limit = 1 } +} +variable "enable_ephemeral_runners" { + description = "Enable ephemeral runners, runners will only be used once." + type = bool + default = false } variable "runner_os" { From 585d5ed7dbbe7b22e15b3b8d9b84472bb4713acc Mon Sep 17 00:00:00 2001 From: Niek Palm Date: Wed, 3 Nov 2021 22:20:07 +0100 Subject: [PATCH 02/31] fix tests --- .../lambdas/runners/src/scale-runners/scale-up.test.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts b/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts index 29a08d84c9..40c601fd41 100644 --- a/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts +++ b/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts @@ -326,7 +326,7 @@ describe('scaleUp with GHES', () => { it('creates a runner with correct config and labels', async () => { process.env.RUNNER_EXTRA_LABELS = 'label1,label2'; await scaleUpModule.scaleUp('aws:sqs', TEST_DATA); - expectedRunnerParams.runnerServiceConfig = expectedRunnerParams.runnerServiceConfig + `--labels label1,label2`; + expectedRunnerParams.runnerServiceConfig = expectedRunnerParams.runnerServiceConfig + ` --labels label1,label2`; expect(createRunner).toBeCalledWith(expectedRunnerParams, 'lt-1'); }); @@ -334,7 +334,7 @@ describe('scaleUp with GHES', () => { process.env.RUNNER_EXTRA_LABELS = 'label1,label2'; process.env.RUNNER_GROUP_NAME = 'TEST_GROUP_IGNORED'; await scaleUpModule.scaleUp('aws:sqs', TEST_DATA); - expectedRunnerParams.runnerServiceConfig = expectedRunnerParams.runnerServiceConfig + `--labels label1,label2`; + expectedRunnerParams.runnerServiceConfig = expectedRunnerParams.runnerServiceConfig + ` --labels label1,label2`; expect(createRunner).toBeCalledWith(expectedRunnerParams, 'lt-1'); }); From 89e3a373416b15db277bdf510ced87d36de2a4a4 Mon Sep 17 00:00:00 2001 From: Niek Palm Date: Tue, 30 Nov 2021 22:51:50 +0100 Subject: [PATCH 03/31] Add retry mechanisme for scaling errors --- .../lambdas/runners/src/scale-runners/ScaleError.ts | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 modules/runners/lambdas/runners/src/scale-runners/ScaleError.ts diff --git a/modules/runners/lambdas/runners/src/scale-runners/ScaleError.ts b/modules/runners/lambdas/runners/src/scale-runners/ScaleError.ts new file mode 100644 index 0000000000..d7e71f8c33 --- /dev/null +++ b/modules/runners/lambdas/runners/src/scale-runners/ScaleError.ts @@ -0,0 +1,9 @@ +class ScaleError extends Error { + constructor(public message: string) { + super(message); + this.name = 'ScaleError'; + this.stack = new Error().stack; + } +} + +export default ScaleError; From d70e338236fb5b1843528974dc0d8215df0e2374 Mon Sep 17 00:00:00 2001 From: Niek Palm Date: Fri, 10 Dec 2021 17:51:33 +0100 Subject: [PATCH 04/31] Add retry mechanisme for scaling errors Add retry mechanisme for scaling errors Add retry mechanisme for scaling errors Add retry mechanisme for scaling errors --- examples/default/main.tf | 18 ++++++++----- modules/runners/lambdas/runners/src/lambda.ts | 26 ++++++++++++------- .../runners/src/scale-runners/runners.ts | 20 +++++++------- 
.../runners/src/scale-runners/scale-up.ts | 11 ++++---- modules/runners/scale-up.tf | 1 + 5 files changed, 47 insertions(+), 29 deletions(-) diff --git a/examples/default/main.tf b/examples/default/main.tf index 6f6d617808..a8c40c47ee 100644 --- a/examples/default/main.tf +++ b/examples/default/main.tf @@ -30,11 +30,11 @@ module "runners" { webhook_secret = random_id.random.hex } - webhook_lambda_zip = "lambdas-download/webhook.zip" - runner_binaries_syncer_lambda_zip = "lambdas-download/runner-binaries-syncer.zip" - runners_lambda_zip = "lambdas-download/runners.zip" - enable_organization_runners = false - runner_extra_labels = "default,example" + # webhook_lambda_zip = "lambdas-download/webhook.zip" + # runner_binaries_syncer_lambda_zip = "lambdas-download/runner-binaries-syncer.zip" + # runners_lambda_zip = "lambdas-download/runners.zip" + enable_organization_runners = true + runner_extra_labels = "default,example" # enable access to the runners via SSM enable_ssm_on_runners = true @@ -61,8 +61,14 @@ module "runners" { instance_types = ["m5.large", "c5.large"] # override delay of events in seconds - delay_webhook_event = 0 + delay_webhook_event = 10 + //job_queue_retention_in_seconds = 600 + //job_queue_retention_in_seconds = 60 + runners_maximum_count = 1 # override scaling down scale_down_schedule_expression = "cron(* * * * ? *)" + + enable_ephemeral_runners = true + disable_check_wokflow_job_labels = true } diff --git a/modules/runners/lambdas/runners/src/lambda.ts b/modules/runners/lambdas/runners/src/lambda.ts index a784c0d059..cd394f6ac2 100644 --- a/modules/runners/lambdas/runners/src/lambda.ts +++ b/modules/runners/lambdas/runners/src/lambda.ts @@ -2,21 +2,29 @@ import { scaleUp } from './scale-runners/scale-up'; import { scaleDown } from './scale-runners/scale-down'; import { SQSEvent, ScheduledEvent, Context, Callback } from 'aws-lambda'; import { logger } from './scale-runners/logger'; +import ScaleError from './scale-runners/ScaleError'; import 'source-map-support/register'; export async function scaleUpHandler(event: SQSEvent, context: Context, callback: Callback): Promise { logger.setSettings({ requestId: context.awsRequestId }); logger.debug(JSON.stringify(event)); - try { - for (const e of event.Records) { - await scaleUp(e.eventSource, JSON.parse(e.body)); - } - - callback(null); - } catch (e) { - logger.error(e); - callback('Failed handling SQS event'); + // TODO find the a more elegant way :( + if (event.Records.length != 1) { + logger.warn('Event ignored, only on record at the time can be handled, ensure the lambda batch size is set to 1.'); + return new Promise((resolve) => resolve()); } + + return new Promise((resolve, reject) => { + scaleUp(event.Records[0].eventSource, JSON.parse(event.Records[0].body)) + .then(() => resolve()) + .catch((e: Error) => { + if (e instanceof ScaleError) { + reject(e); + } else { + logger.warn('Ignoring error: ', e); + } + }); + }); } export async function scaleDownHandler(event: ScheduledEvent, context: Context, callback: Callback): Promise { diff --git a/modules/runners/lambdas/runners/src/scale-runners/runners.ts b/modules/runners/lambdas/runners/src/scale-runners/runners.ts index 4453d9cd0b..ba2409467c 100644 --- a/modules/runners/lambdas/runners/src/scale-runners/runners.ts +++ b/modules/runners/lambdas/runners/src/scale-runners/runners.ts @@ -90,15 +90,17 @@ export async function createRunner(runnerParameters: RunnerInputParameters, laun LogFields.print(), ); const ssm = new SSM(); - runInstancesResponse.Instances?.forEach(async 
(i: EC2.Instance) => { - await ssm - .putParameter({ - Name: runnerParameters.environment + '-' + (i.InstanceId as string), - Value: runnerParameters.runnerServiceConfig, - Type: 'SecureString', - }) - .promise(); - }); + if (runInstancesResponse.Instances != undefined) { + for (let i = 0; i < runInstancesResponse.Instances?.length; i++) { + await ssm + .putParameter({ + Name: runnerParameters.environment + '-' + (runInstancesResponse.Instances[i].InstanceId as string), + Value: runnerParameters.runnerServiceConfig, + Type: 'SecureString', + }) + .promise(); + } + } } function getInstanceParams( diff --git a/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts b/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts index bd2dce6f86..610a9b6a4b 100644 --- a/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts +++ b/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts @@ -3,6 +3,7 @@ import { createOctoClient, createGithubAppAuth, createGithubInstallationAuth } f import yn from 'yn'; import { Octokit } from '@octokit/rest'; import { logger as rootLogger, LogFields } from './logger'; +import ScaleError from './ScaleError'; const logger = rootLogger.getChildLogger({ name: 'scale-up' }); @@ -64,9 +65,7 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage const ghAuth = await createGithubInstallationAuth(installationId, ghesApiUrl); const githubInstallationClient = await createOctoClient(ghAuth.token, ghesApiUrl); - const isQueued = await getJobStatus(githubInstallationClient, payload); - // ephemeral runners should be created on every event, will only work with `workflow_job` events. - if (ephemeral || isQueued) { + if (ephemeral || (await getJobStatus(githubInstallationClient, payload))) { const currentRunners = await listEC2Runners({ environment, runnerType, @@ -74,7 +73,6 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage }); logger.info(`Current runners: ${currentRunners.length} of ${maximumRunners}`, LogFields.print()); - // TODO: how to handle the event if the max is reached in case of ephemeral runners if (currentRunners.length < maximumRunners) { logger.info(`Attempting to launch a new runner`, LogFields.print()); // create token @@ -102,6 +100,9 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage }); } else { logger.info('No runner will be created, maximum number of runners reached.', LogFields.print()); + if (ephemeral) { + throw new ScaleError('No runners create: maximum of runners reached.'); + } } } } @@ -146,6 +147,6 @@ export async function createRunnerLoop(runnerParameters: RunnerInputParameters): } } if (launched == false) { - throw Error('All launch templates failed'); + throw new ScaleError('All launch templates failed'); } } diff --git a/modules/runners/scale-up.tf b/modules/runners/scale-up.tf index 77057dd400..e217cf4b74 100644 --- a/modules/runners/scale-up.tf +++ b/modules/runners/scale-up.tf @@ -50,6 +50,7 @@ resource "aws_cloudwatch_log_group" "scale_up" { resource "aws_lambda_event_source_mapping" "scale_up" { event_source_arn = var.sqs_build_queue.arn function_name = aws_lambda_function.scale_up.arn + batch_size = 1 } resource "aws_lambda_permission" "scale_runners_lambda" { From 4c982e5136ac4723e6d79beb01ec00c8769554f6 Mon Sep 17 00:00:00 2001 From: Niek Palm Date: Wed, 1 Dec 2021 15:56:19 +0100 Subject: [PATCH 05/31] Add tests for lamda handler --- .../runners/lambdas/runners/jest.config.js | 2 +- .../lambdas/runners/src/lambda.test.ts 
| 114 ++++++++++++++++++ modules/runners/lambdas/runners/src/lambda.ts | 2 +- 3 files changed, 116 insertions(+), 2 deletions(-) create mode 100644 modules/runners/lambdas/runners/src/lambda.test.ts diff --git a/modules/runners/lambdas/runners/jest.config.js b/modules/runners/lambdas/runners/jest.config.js index 79ed0ba8aa..8c7a9f17c5 100644 --- a/modules/runners/lambdas/runners/jest.config.js +++ b/modules/runners/lambdas/runners/jest.config.js @@ -2,7 +2,7 @@ module.exports = { preset: 'ts-jest', testEnvironment: 'node', collectCoverage: true, - collectCoverageFrom: ['src/**/*.{ts,js,jsx}'], + collectCoverageFrom: ['src/**/*.{ts,js,jsx}','!src/**/*local*.ts'], coverageThreshold: { global: { branches: 80, diff --git a/modules/runners/lambdas/runners/src/lambda.test.ts b/modules/runners/lambdas/runners/src/lambda.test.ts new file mode 100644 index 0000000000..f617ec7255 --- /dev/null +++ b/modules/runners/lambdas/runners/src/lambda.test.ts @@ -0,0 +1,114 @@ +import { fail } from 'assert'; +import { Context, SQSEvent, SQSRecord } from 'aws-lambda'; +import { mocked } from 'ts-jest/utils'; +import { scaleUpHandler } from './lambda'; +import { ActionRequestMessage, scaleUp } from './scale-runners/scale-up'; +import ScaleError from './scale-runners/ScaleError'; +import { logger } from './scale-runners/logger'; + +const body: ActionRequestMessage = { + eventType: 'workflow_job', + id: 1, + installationId: 1, + repositoryName: 'name', + repositoryOwner: 'owner', +}; + +const sqsRecord: SQSRecord = { + attributes: { + ApproximateFirstReceiveTimestamp: '', + ApproximateReceiveCount: '', + SenderId: '', + SentTimestamp: '', + }, + awsRegion: '', + body: JSON.stringify(body), + eventSource: 'aws:SQS', + eventSourceARN: '', + md5OfBody: '', + messageAttributes: {}, + messageId: '', + receiptHandle: '', +}; + +const sqsEvent: SQSEvent = { + Records: [sqsRecord], +}; + +const context: Context = { + awsRequestId: '1', + callbackWaitsForEmptyEventLoop: false, + functionName: '', + functionVersion: '', + getRemainingTimeInMillis: () => 0, + invokedFunctionArn: '', + logGroupName: '', + logStreamName: '', + memoryLimitInMB: '', + done: () => { + return; + }, + fail: () => { + return; + }, + succeed: () => { + return; + }, +}; + +jest.mock('./scale-runners/scale-up'); +jest.mock('./scale-runners/logger'); + +describe('Test scale up lambda wrapper.', () => { + it('Do not handle multiple record sets.', async () => { + await testInvalidRecords([sqsRecord, sqsRecord]); + }); + + it('Do not handle empty record sets.', async () => { + await testInvalidRecords([]); + }); + + it('Scale without error should resolve.', async () => { + const mock = mocked(scaleUp); + mock.mockImplementation(() => { + return new Promise((resolve, reject) => { + resolve(); + }); + }); + await expect(scaleUpHandler(sqsEvent, context)).resolves; + }); + + it('Non scale should resolve.', async () => { + const error = new Error('some error'); + const mock = mocked(scaleUp); + mock.mockRejectedValue(error); + await expect(scaleUpHandler(sqsEvent, context)).resolves; + }); + + it('Scale should be rejected', async () => { + const error = new ScaleError('some scale error'); + const mock = mocked(scaleUp); + + mock.mockRejectedValue(error); + await expect(scaleUpHandler(sqsEvent, context)).rejects.toThrow(error); + }); +}); + +async function testInvalidRecords(sqsRecords: SQSRecord[]) { + const mock = mocked(scaleUp); + const logWarnSpy = jest.spyOn(logger, 'warn'); + mock.mockImplementation(() => { + return new Promise((resolve) => { + 
resolve(); + }); + }); + const sqsEventMultipleRecords: SQSEvent = { + Records: sqsRecords, + }; + + await expect(scaleUpHandler(sqsEventMultipleRecords, context)).resolves; + + expect(logWarnSpy).toHaveBeenCalledWith( + 'Event ignored, only on record at the time can be handled, ensure the lambda batch size is set to 1.', + ); +} diff --git a/modules/runners/lambdas/runners/src/lambda.ts b/modules/runners/lambdas/runners/src/lambda.ts index cd394f6ac2..4222027e77 100644 --- a/modules/runners/lambdas/runners/src/lambda.ts +++ b/modules/runners/lambdas/runners/src/lambda.ts @@ -5,7 +5,7 @@ import { logger } from './scale-runners/logger'; import ScaleError from './scale-runners/ScaleError'; import 'source-map-support/register'; -export async function scaleUpHandler(event: SQSEvent, context: Context, callback: Callback): Promise { +export async function scaleUpHandler(event: SQSEvent, context: Context): Promise { logger.setSettings({ requestId: context.awsRequestId }); logger.debug(JSON.stringify(event)); // TODO find the a more elegant way :( From e9f360e81f2aac69bc7700a83ec55ef66ca4e00d Mon Sep 17 00:00:00 2001 From: Niek Palm Date: Wed, 1 Dec 2021 16:04:24 +0100 Subject: [PATCH 06/31] Add basic test for ephemeral case --- .../runners/src/scale-runners/scale-up.test.ts | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts b/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts index 40c601fd41..afac7a943c 100644 --- a/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts +++ b/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts @@ -4,6 +4,7 @@ import { listEC2Runners, createRunner, RunnerInputParameters } from './runners'; import * as ghAuth from './gh-auth'; import nock from 'nock'; import { Octokit } from '@octokit/rest'; +import ScaleError from './ScaleError'; const mockOctokit = { checks: { get: jest.fn() }, @@ -541,5 +542,18 @@ describe('scaleUp with public GH', () => { expect(createRunner).toHaveBeenNthCalledWith(1, expectedRunnerParams, 'lt-1'); expect(createRunner).toHaveBeenNthCalledWith(2, expectedRunnerParams, 'lt-2'); }); + + it('creates a ephemeral runner.', async () => { + process.env.ENABLE_EPHEMERAL_RUNNERS = 'true'; + await scaleUpModule.scaleUp('aws:sqs', TEST_DATA); + expectedRunnerParams.runnerServiceConfig = expectedRunnerParams.runnerServiceConfig + ` --ephemeral`; + expect(createRunner).toBeCalledWith(expectedRunnerParams, LAUNCH_TEMPLATE); + }); + + it('Scaling error should cause reject so retry can be triggered.', async () => { + process.env.RUNNERS_MAXIMUM_COUNT = '1'; + process.env.ENABLE_EPHEMERAL_RUNNERS = 'true'; + await expect(scaleUpModule.scaleUp('aws:sqs', TEST_DATA)).rejects.toBeInstanceOf(ScaleError); + }); }); }); From f26673a8b3e45965e8172d45078c2dc89859278a Mon Sep 17 00:00:00 2001 From: Niek Palm Date: Wed, 1 Dec 2021 16:13:32 +0100 Subject: [PATCH 07/31] Add basic test for scale down in lambda wrapper --- .../lambdas/runners/src/lambda.test.ts | 21 +++++++++++++++++++ modules/runners/lambdas/runners/src/lambda.ts | 19 +++++++++-------- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/modules/runners/lambdas/runners/src/lambda.test.ts b/modules/runners/lambdas/runners/src/lambda.test.ts index f617ec7255..f1a916372e 100644 --- a/modules/runners/lambdas/runners/src/lambda.test.ts +++ b/modules/runners/lambdas/runners/src/lambda.test.ts @@ -5,6 +5,7 @@ import { scaleUpHandler } from './lambda'; import { 
ActionRequestMessage, scaleUp } from './scale-runners/scale-up'; import ScaleError from './scale-runners/ScaleError'; import { logger } from './scale-runners/logger'; +import { scaleDown } from './scale-runners/scale-down'; const body: ActionRequestMessage = { eventType: 'workflow_job', @@ -57,6 +58,7 @@ const context: Context = { }; jest.mock('./scale-runners/scale-up'); +jest.mock('./scale-runners/scale-down'); jest.mock('./scale-runners/logger'); describe('Test scale up lambda wrapper.', () => { @@ -112,3 +114,22 @@ async function testInvalidRecords(sqsRecords: SQSRecord[]) { 'Event ignored, only on record at the time can be handled, ensure the lambda batch size is set to 1.', ); } + +describe('Test scale down lambda wrapper.', () => { + it('Scaling down no error.', async () => { + const mock = mocked(scaleDown); + mock.mockImplementation(() => { + return new Promise((resolve) => { + resolve(); + }); + }); + await expect(scaleDown()).resolves; + }); + + it('Scaling down with error.', async () => { + const error = new Error('some error'); + const mock = mocked(scaleDown); + mock.mockRejectedValue(error); + await expect(scaleDown()).resolves; + }); +}); diff --git a/modules/runners/lambdas/runners/src/lambda.ts b/modules/runners/lambdas/runners/src/lambda.ts index 4222027e77..23a7d7089a 100644 --- a/modules/runners/lambdas/runners/src/lambda.ts +++ b/modules/runners/lambdas/runners/src/lambda.ts @@ -8,7 +8,6 @@ import 'source-map-support/register'; export async function scaleUpHandler(event: SQSEvent, context: Context): Promise { logger.setSettings({ requestId: context.awsRequestId }); logger.debug(JSON.stringify(event)); - // TODO find the a more elegant way :( if (event.Records.length != 1) { logger.warn('Event ignored, only on record at the time can be handled, ensure the lambda batch size is set to 1.'); return new Promise((resolve) => resolve()); @@ -27,13 +26,15 @@ export async function scaleUpHandler(event: SQSEvent, context: Context): Promise }); } -export async function scaleDownHandler(event: ScheduledEvent, context: Context, callback: Callback): Promise { +export async function scaleDownHandler(event: ScheduledEvent, context: Context): Promise { logger.setSettings({ requestId: context.awsRequestId }); - try { - await scaleDown(); - callback(null); - } catch (e) { - logger.error(e); - callback('Failed'); - } + + return new Promise((resolve) => { + scaleDown() + .then(() => resolve()) + .catch((e) => { + logger.error(e); + resolve(); + }); + }); } From ad16d803019226555277eb4aad4a785f4e07145d Mon Sep 17 00:00:00 2001 From: Niek Palm Date: Fri, 10 Dec 2021 17:52:22 +0100 Subject: [PATCH 08/31] Ensure check_runs are ignored for ephemeral runners --- .../lambdas/runners/src/scale-runners/scale-up.test.ts | 10 ++++++++++ .../lambdas/runners/src/scale-runners/scale-up.ts | 9 +++++++++ 2 files changed, 19 insertions(+) diff --git a/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts b/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts index afac7a943c..b472911edc 100644 --- a/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts +++ b/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts @@ -543,6 +543,16 @@ describe('scaleUp with public GH', () => { expect(createRunner).toHaveBeenNthCalledWith(2, expectedRunnerParams, 'lt-2'); }); + it('ephemeral runners cannot only run with workflow_job event, others should fail.', async () => { + process.env.ENABLE_EPHEMERAL_RUNNERS = 'true'; + await expect( + scaleUpModule.scaleUp('aws:sqs', 
{
+        ...TEST_DATA,
+        eventType: 'check_run',
+      }),
+    ).rejects.toBeInstanceOf(Error);
+  });
+
   it('creates a ephemeral runner.', async () => {
     process.env.ENABLE_EPHEMERAL_RUNNERS = 'true';
     await scaleUpModule.scaleUp('aws:sqs', TEST_DATA);
diff --git a/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts b/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts
index 610a9b6a4b..59a66aef11 100644
--- a/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts
+++ b/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts
@@ -16,6 +16,8 @@ export interface ActionRequestMessage {
 }
 
 export async function scaleUp(eventSource: string, payload: ActionRequestMessage): Promise<void> {
+  logger.info(`Received ${payload.eventType} from ${payload.repositoryOwner}/${payload.repositoryName}`);
+
   if (eventSource !== 'aws:sqs') throw Error('Cannot handle non-SQS events!');
   const enableOrgLevel = yn(process.env.ENABLE_ORGANIZATION_RUNNERS, { default: true });
   const maximumRunners = parseInt(process.env.RUNNERS_MAXIMUM_COUNT || '3');
@@ -26,6 +28,13 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage
   const ephemeralEnabled = yn(process.env.ENABLE_EPHEMERAL_RUNNERS, { default: false });
 
   // TODO: handle case event is check_run and ephemeralEnabled = true
+  if (ephemeralEnabled && payload.eventType != 'workflow_job') {
+    logger.warn(`${payload.eventType} event is not supported in combination with ephemeral runners.`);
+    throw Error(
+      `The event type ${payload.eventType} is not supported in combination with ephemeral runners. ` +
+        `Please ensure you have enabled workflow_job events.`,
+    );
+  }
   const ephemeral = ephemeralEnabled && payload.eventType === 'workflow_job';
From 5fb7237048f24ce3b71bb7ff2fcbd61b088567cb Mon Sep 17 00:00:00 2001
From: Niek Palm
Date: Thu, 2 Dec 2021 15:23:25 +0100
Subject: [PATCH 09/31] limit termination to only the instance itself

---
 modules/runners/policies/instance-ec2.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/modules/runners/policies/instance-ec2.json b/modules/runners/policies/instance-ec2.json
index b2a22cb2e2..1757552dd8 100644
--- a/modules/runners/policies/instance-ec2.json
+++ b/modules/runners/policies/instance-ec2.json
@@ -7,7 +7,7 @@
       "Resource": "*",
       "Condition": {
         "StringEquals": {
-          "ec2:ResourceTag/Application": "github-action-runner"
+          "aws:ARN": "$${ec2:SourceInstanceARN}"
         }
       }
     }
From b6b4fa3aa8e2506e3a6189e37898610282bedf32 Mon Sep 17 00:00:00 2001
From: Niek Palm
Date: Fri, 10 Dec 2021 17:53:03 +0100
Subject: [PATCH 10/31] fix: add logging context to runner lambda (#1399)

* fix(logging): Add context to scale logs
Signed-off-by: Nathaniel McAuliffe
* Remove testing
Signed-off-by: Nathaniel McAuliffe
* Remove unnecessary import
Signed-off-by: Nathaniel McAuliffe
* Moving log fields to end, adjusting format
---
 .../lambdas/runners/src/scale-runners/scale-up.ts | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)
diff --git a/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts b/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts
index 59a66aef11..d2557b0955 100644
--- a/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts
+++ b/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts
@@ -2,7 +2,7 @@ import { listEC2Runners, createRunner, RunnerInputParameters } from './runners';
 import { createOctoClient, createGithubAppAuth, createGithubInstallationAuth } from './gh-auth';
 import yn
from 'yn'; import { Octokit } from '@octokit/rest'; -import { logger as rootLogger, LogFields } from './logger'; +import { LogFields, logger as rootLogger } from './logger'; import ScaleError from './ScaleError'; const logger = rootLogger.getChildLogger({ name: 'scale-up' }); @@ -27,7 +27,6 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage const ghesBaseUrl = process.env.GHES_URL; const ephemeralEnabled = yn(process.env.ENABLE_EPHEMERAL_RUNNERS, { default: false }); - // TODO: handle case event is check_run and ephemeralEnabled = true if (ephemeralEnabled && payload.eventType != 'workflow_job') { logger.warn(`${payload.eventType} even is not supported in combination with ephemeral runners.`); throw Error( @@ -36,6 +35,16 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage ); } const ephemeral = ephemeralEnabled && payload.eventType === 'workflow_job'; + const runnerType = enableOrgLevel ? 'Org' : 'Repo'; + const runnerOwner = enableOrgLevel ? payload.repositoryOwner : `${payload.repositoryOwner}/${payload.repositoryName}`; + + LogFields.fields = {}; + LogFields.fields.runnerType = runnerType; + LogFields.fields.runnerOwner = runnerOwner; + LogFields.fields.event = payload.eventType; + LogFields.fields.id = payload.id.toString(); + + logger.info(`Received event`, LogFields.print()); const runnerType = enableOrgLevel ? 'Org' : 'Repo'; const runnerOwner = enableOrgLevel ? payload.repositoryOwner : `${payload.repositoryOwner}/${payload.repositoryName}`; From 14ac0db04dc9318aa3a3858de63ab879c425133b Mon Sep 17 00:00:00 2001 From: Niek Palm Date: Wed, 22 Dec 2021 09:14:46 +0100 Subject: [PATCH 11/31] feat: Add hooks for prebuilt images (AMI), including amazon linux packer example (#1444) * Initial creation of runner image * Refactored startup script and added it to the per-boot folder * Make the runner location a variable So we can pass the runner version in at packer build time if we want to update the runner version. * Retrieve external config setting via tags Retrieve the required config via the instance tags so we dont have to pass in and set environment on the instance in an awkward way. * Enable tag based config Give the instance the permission to query its own tags and set the correct tags on the instance. * Add a CI job * Fix the CI build * Fix the formatting * Retain user_data provisioning and remove duplication refactored to make sure user_data continues to work with minimal breaking changes. Use a single set of scripts shared between image and user_data provisioning. * Fix interpolation issues in template file * fix build * Fix formatting * minor tweaks and fixes * Fixes from testing * Enable docker on boot * Add in output of start time for the runner * Scoop up the runner log * Add a powershell build script for windows users * Fix formatting * Use SSM parameters for configuration Its best practice to use SSM parameters for configuration of the runners. In adding this i have also added parameter path based config so its easy to extend in the future. * Make the SSM policy more specific * Update .github/workflows/packer-build.yml Co-authored-by: Niek Palm * Added condition to the describe tags policy * Dont use templatefile on the tags policy Because of the use of ${} in the policy terraform is trying to replace it. 
* Added an option to turn off userdata scripting * Added/updated documentation * Revert policy as it has no effect on the permissions * Add reference to prebuilt images in the main readme * Add an example of deploying with prebuilt images * Update readme * Use current user as ami_owner * Update example to 5 secs * Updated ami name to include the arch * Fixed log file variable * Added explicit info about required settings to the readme * Change userdata_enabled to enabled_userdata Keep within existing naming convention Co-authored-by: Niek Palm --- README.md | 47 ++++++++++++----------- modules/runners/templates/start-runner.sh | 13 +++---- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index 58af8d4dc5..c423d0b71c 100644 --- a/README.md +++ b/README.md @@ -299,6 +299,7 @@ Examples are located in the [examples](./examples) directory. The following exam - _[Ubuntu](examples/ubuntu/README.md)_: Example usage of creating a runner using Ubuntu AMIs. - _[Prebuilt Images](examples/prebuilt/README.md)_: Example usages of deploying runners with a custom prebuilt image. - _[Windows](examples/windows/README.md)_: Example usage of creating a runner using Windows as the OS. +- _[Prebuilt Images](examples/prebuilt/README.md)_: Example usages of deploying runners with a custom prebuilt image. ## Sub modules @@ -340,34 +341,34 @@ In case the setup does not work as intended follow the trace of events: ## Requirements -| Name | Version | -|------|---------| +| Name | Version | +| ------------------------------------------------------------------------- | --------- | | [terraform](#requirement\_terraform) | >= 0.14.1 | -| [aws](#requirement\_aws) | >= 3.38 | +| [aws](#requirement\_aws) | >= 3.38 | ## Providers -| Name | Version | -|------|---------| -| [aws](#provider\_aws) | >= 3.38 | -| [random](#provider\_random) | n/a | +| Name | Version | +| ---------------------------------------------------------- | ------- | +| [aws](#provider\_aws) | >= 3.38 | +| [random](#provider\_random) | n/a | ## Modules -| Name | Source | Version | -|------|--------|---------| -| [runner\_binaries](#module\_runner\_binaries) | ./modules/runner-binaries-syncer | n/a | -| [runners](#module\_runners) | ./modules/runners | n/a | -| [ssm](#module\_ssm) | ./modules/ssm | n/a | -| [webhook](#module\_webhook) | ./modules/webhook | n/a | +| Name | Source | Version | +| ----------------------------------------------------------------------------------- | -------------------------------- | ------- | +| [runner\_binaries](#module\_runner\_binaries) | ./modules/runner-binaries-syncer | n/a | +| [runners](#module\_runners) | ./modules/runners | n/a | +| [ssm](#module\_ssm) | ./modules/ssm | n/a | +| [webhook](#module\_webhook) | ./modules/webhook | n/a | ## Resources -| Name | Type | -|------|------| +| Name | Type | +| ------------------------------------------------------------------------------------------------------------------------------------------------- | -------- | | [aws_resourcegroups_group.resourcegroups_group](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/resourcegroups_group) | resource | -| [aws_sqs_queue.queued_builds](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/sqs_queue) | resource | -| [random_string.random](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/string) | resource | +| 
[aws_sqs_queue.queued_builds](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/sqs_queue) | resource | +| [random_string.random](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/string) | resource | ## Inputs @@ -448,12 +449,12 @@ In case the setup does not work as intended follow the trace of events: ## Outputs -| Name | Description | -|------|-------------| -| [binaries\_syncer](#output\_binaries\_syncer) | n/a | -| [runners](#output\_runners) | n/a | -| [ssm\_parameters](#output\_ssm\_parameters) | n/a | -| [webhook](#output\_webhook) | n/a | +| Name | Description | +| ----------------------------------------------------------------------------------- | ----------- | +| [binaries\_syncer](#output\_binaries\_syncer) | n/a | +| [runners](#output\_runners) | n/a | +| [ssm\_parameters](#output\_ssm\_parameters) | n/a | +| [webhook](#output\_webhook) | n/a | ## Contribution diff --git a/modules/runners/templates/start-runner.sh b/modules/runners/templates/start-runner.sh index 6da70d1f9b..f63b77f59e 100644 --- a/modules/runners/templates/start-runner.sh +++ b/modules/runners/templates/start-runner.sh @@ -29,12 +29,11 @@ echo "Retrieved /$environment/runner/enable-cloudwatch parameter - ($enable_clou agent_mode=$(echo "$parameters" | jq --arg environment "$environment" -r '.[] | select(.Name == "/\($environment)/runner/agent-mode") | .Value') echo "Retrieved /$environment/runner/agent-mode parameter - ($agent_mode)" -if [[ -n "$enable_cloudwatch_agent" ]]; then - echo "Cloudwatch is enabled" +if [[ -n "$enable_cloudwatch_agent" ]]; then + echo "Cloudwatch is enabled" amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -s -c "ssm:$environment-cloudwatch_agent_config_runner" fi - ## Configure the runner echo "Get GH Runner config from AWS SSM" @@ -66,18 +65,18 @@ sudo --preserve-env=RUNNER_ALLOW_RUNASROOT -u "$run_as" -- ./config.sh --unatten echo "Starting runner after $(awk '{print int($1/3600)":"int(($1%3600)/60)":"int($1%60)}' /proc/uptime)" echo "Starting the runner as user $run_as" -if [[ $agent_mode = "ephemeral" ]]; then +if [[ $agent_mode = "ephemeral" ]]; then echo "Starting the runner in ephemeral mode" sudo --preserve-env=RUNNER_ALLOW_RUNASROOT -u "$run_as" -- ./run.sh echo "Runner has finished" - + echo "Stopping cloudwatch service" service awslogsd stop echo "Terminating instance" aws ec2 terminate-instances --instance-ids "$instance_id" --region "$region" -else +else echo "Installing the runner as a service" ./svc.sh install "$run_as" echo "Starting the runner in persistent mode" ./svc.sh start -fi \ No newline at end of file +fi From 4687395f7afae5f2129e03c347aa21afac4b003f Mon Sep 17 00:00:00 2001 From: Niek Palm Date: Fri, 10 Dec 2021 17:44:04 +0100 Subject: [PATCH 12/31] add option ephemeral runners --- examples/default/main.tf | 5 +---- .../runners/lambdas/runners/src/scale-runners/scale-up.ts | 1 + 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/examples/default/main.tf b/examples/default/main.tf index a8c40c47ee..77f46a56dd 100644 --- a/examples/default/main.tf +++ b/examples/default/main.tf @@ -61,10 +61,7 @@ module "runners" { instance_types = ["m5.large", "c5.large"] # override delay of events in seconds - delay_webhook_event = 10 - //job_queue_retention_in_seconds = 600 - //job_queue_retention_in_seconds = 60 - runners_maximum_count = 1 + delay_webhook_event = 0 # override scaling down scale_down_schedule_expression = "cron(* * * * ? 
*)" diff --git a/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts b/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts index d2557b0955..7ef1dd1c79 100644 --- a/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts +++ b/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts @@ -91,6 +91,7 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage }); logger.info(`Current runners: ${currentRunners.length} of ${maximumRunners}`, LogFields.print()); + // TODO: how to handle the event if the max is reached in case of ephemeral runners if (currentRunners.length < maximumRunners) { logger.info(`Attempting to launch a new runner`, LogFields.print()); // create token From 0b6a8e75684169b9184b529ccb7e593f88d84649 Mon Sep 17 00:00:00 2001 From: Niek Palm Date: Fri, 10 Dec 2021 17:44:42 +0100 Subject: [PATCH 13/31] Add retry mechanisme for scaling errors Add retry mechanisme for scaling errors Add retry mechanisme for scaling errors Add retry mechanisme for scaling errors --- examples/default/main.tf | 5 ++++- .../runners/lambdas/runners/src/scale-runners/scale-up.ts | 1 - 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/default/main.tf b/examples/default/main.tf index 77f46a56dd..a8c40c47ee 100644 --- a/examples/default/main.tf +++ b/examples/default/main.tf @@ -61,7 +61,10 @@ module "runners" { instance_types = ["m5.large", "c5.large"] # override delay of events in seconds - delay_webhook_event = 0 + delay_webhook_event = 10 + //job_queue_retention_in_seconds = 600 + //job_queue_retention_in_seconds = 60 + runners_maximum_count = 1 # override scaling down scale_down_schedule_expression = "cron(* * * * ? *)" diff --git a/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts b/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts index 7ef1dd1c79..d2557b0955 100644 --- a/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts +++ b/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts @@ -91,7 +91,6 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage }); logger.info(`Current runners: ${currentRunners.length} of ${maximumRunners}`, LogFields.print()); - // TODO: how to handle the event if the max is reached in case of ephemeral runners if (currentRunners.length < maximumRunners) { logger.info(`Attempting to launch a new runner`, LogFields.print()); // create token From 99399a18aebbdee73c0b495f370ee26ccec77aa6 Mon Sep 17 00:00:00 2001 From: Niek Palm Date: Wed, 22 Dec 2021 09:17:18 +0100 Subject: [PATCH 14/31] add dead letter queue, and refactor --- README.md | 52 ++++++++------- examples/default/main.tf | 17 ++--- examples/ephemeral/main.tf | 47 ++++++++------ examples/ephemeral/outputs.tf | 2 +- examples/ephemeral/variables.tf | 3 - main.tf | 20 ++++-- modules/runners/runner-config.tf | 7 +- modules/runners/templates/start-runner.sh | 2 +- .../webhook/lambdas/webhook/jest.config.js | 10 +++ .../lambdas/webhook/src/sqs/index.test.ts | 65 +++++++++++++++++++ .../webhook/lambdas/webhook/src/sqs/index.ts | 32 +++++---- .../webhook/src/webhook/handler.test.ts | 21 ++++++ .../lambdas/webhook/src/webhook/handler.ts | 18 ++--- outputs.tf | 5 ++ variables.tf | 26 +++++++- 15 files changed, 238 insertions(+), 89 deletions(-) create mode 100644 modules/webhook/lambdas/webhook/src/sqs/index.test.ts diff --git a/README.md b/README.md index c423d0b71c..1c772a3eab 100644 --- a/README.md +++ b/README.md @@ -341,34 +341,35 @@ In case the setup does not work as intended 
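As an aside, a minimal sketch of how the `redrive_build_queue` input introduced by this patch could be set from a consuming configuration; the module source path and the retry count are assumptions for illustration, not part of the change itself:

```hcl
# Illustrative only: attach the optional dead letter queue to the build queue.
module "runners" {
  source = "../../"

  # ... other required inputs as in the examples ...

  redrive_build_queue = {
    enabled             = true
    maxReceiveCount     = 50   # events are retried roughly every 30 seconds => ~25 minutes
    deadLetterTargetArn = null # null lets the module create the dead letter queue
  }
}
```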
follow the trace of events: ## Requirements -| Name | Version | -| ------------------------------------------------------------------------- | --------- | +| Name | Version | +|------|---------| | [terraform](#requirement\_terraform) | >= 0.14.1 | -| [aws](#requirement\_aws) | >= 3.38 | +| [aws](#requirement\_aws) | >= 3.38 | ## Providers -| Name | Version | -| ---------------------------------------------------------- | ------- | -| [aws](#provider\_aws) | >= 3.38 | -| [random](#provider\_random) | n/a | +| Name | Version | +|------|---------| +| [aws](#provider\_aws) | >= 3.38 | +| [random](#provider\_random) | n/a | ## Modules -| Name | Source | Version | -| ----------------------------------------------------------------------------------- | -------------------------------- | ------- | -| [runner\_binaries](#module\_runner\_binaries) | ./modules/runner-binaries-syncer | n/a | -| [runners](#module\_runners) | ./modules/runners | n/a | -| [ssm](#module\_ssm) | ./modules/ssm | n/a | -| [webhook](#module\_webhook) | ./modules/webhook | n/a | +| Name | Source | Version | +|------|--------|---------| +| [runner\_binaries](#module\_runner\_binaries) | ./modules/runner-binaries-syncer | n/a | +| [runners](#module\_runners) | ./modules/runners | n/a | +| [ssm](#module\_ssm) | ./modules/ssm | n/a | +| [webhook](#module\_webhook) | ./modules/webhook | n/a | ## Resources -| Name | Type | -| ------------------------------------------------------------------------------------------------------------------------------------------------- | -------- | +| Name | Type | +|------|------| | [aws_resourcegroups_group.resourcegroups_group](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/resourcegroups_group) | resource | -| [aws_sqs_queue.queued_builds](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/sqs_queue) | resource | -| [random_string.random](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/string) | resource | +| [aws_sqs_queue.queued_builds](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/sqs_queue) | resource | +| [aws_sqs_queue.queued_builds_dlq](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/sqs_queue) | resource | +| [random_string.random](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/string) | resource | ## Inputs @@ -388,6 +389,7 @@ In case the setup does not work as intended follow the trace of events: | [enable\_ssm\_on\_runners](#input\_enable\_ssm\_on\_runners) | Enable to allow access the runner instances for debugging purposes via SSM. Note that this adds additional permissions to the runner instances. | `bool` | `false` | no | | [enabled\_userdata](#input\_enabled\_userdata) | Should the userdata script be enabled for the runner. Set this to false if you are using your own prebuilt AMI | `bool` | `true` | no | | [environment](#input\_environment) | A name that identifies the environment, used as prefix and for tagging. | `string` | n/a | yes | +| [fifo\_build\_queue](#input\_fifo\_build\_queue) | Enable a FIFO queue to remain the order of events received by the webhook. Suggest to set to true for repo level runners. | `bool` | `false` | no | | [ghes\_ssl\_verify](#input\_ghes\_ssl\_verify) | GitHub Enterprise SSL verification. Set to 'false' when custom certificate (chains) is used for GitHub Enterprise Server (insecure). | `bool` | `true` | no | | [ghes\_url](#input\_ghes\_url) | GitHub Enterprise Server URL. 
Example: https://github.internal.co - DO NOT SET IF USING PUBLIC GITHUB | `string` | `null` | no | | [github\_app](#input\_github\_app) | GitHub app parameters, see your github app. Ensure the key is the base64-encoded `.pem` file (the output of `base64 app.private-key.pem`, not the content of `private-key.pem`). |
<pre>object({<br>  key_base64     = string<br>  id             = string<br>  webhook_secret = string<br>})</pre>
| n/a | yes | @@ -407,6 +409,7 @@ In case the setup does not work as intended follow the trace of events: | [logging\_retention\_in\_days](#input\_logging\_retention\_in\_days) | Specifies the number of days you want to retain log events for the lambda log group. Possible values are: 0, 1, 3, 5, 7, 14, 30, 60, 90, 120, 150, 180, 365, 400, 545, 731, 1827, and 3653. | `number` | `180` | no | | [market\_options](#input\_market\_options) | Market options for the action runner instances. Setting the value to `null` let the scaler create on-demand instances instead of spot instances. | `string` | `"spot"` | no | | [minimum\_running\_time\_in\_minutes](#input\_minimum\_running\_time\_in\_minutes) | The time an ec2 action runner should be running at minimum before terminated if not busy. | `number` | `null` | no | +| [redrive\_build\_queue](#input\_redrive\_build\_queue) | Set options to attach (optional) a dead letter queue to the build queue, the queue between the webhook and the scale up lambda. You have the following options. 1. Disable by setting, `enalbed' to false. 2. Enable by setting `enabled` to `true`, `maxReceiveCount` to a number of max retries, and `deadLetterTargetArn` to null for letting the module create a queue. Or otherwise provide you own queue by setting an ARN.` |
<pre>object({<br>  enabled             = bool<br>  maxReceiveCount     = number<br>  deadLetterTargetArn = string<br>})</pre> | <pre>{<br>  "deadLetterTargetArn": null,<br>  "enabled": false,<br>  "maxReceiveCount": null<br>}</pre>
| no | | [repository\_white\_list](#input\_repository\_white\_list) | List of repositories allowed to use the github app | `list(string)` | `[]` | no | | [role\_path](#input\_role\_path) | The path that will be added to role path for created roles, if not set the environment name will be used. | `string` | `null` | no | | [role\_permissions\_boundary](#input\_role\_permissions\_boundary) | Permissions boundary that will be added to the created roles. | `string` | `null` | no | @@ -430,7 +433,7 @@ In case the setup does not work as intended follow the trace of events: | [runners\_lambda\_zip](#input\_runners\_lambda\_zip) | File location of the lambda zip file for scaling runners. | `string` | `null` | no | | [runners\_maximum\_count](#input\_runners\_maximum\_count) | The maximum number of runners that will be created. | `number` | `3` | no | | [runners\_scale\_down\_lambda\_timeout](#input\_runners\_scale\_down\_lambda\_timeout) | Time out for the scale down lambda in seconds. | `number` | `60` | no | -| [runners\_scale\_up\_lambda\_timeout](#input\_runners\_scale\_up\_lambda\_timeout) | Time out for the scale up lambda in seconds. | `number` | `180` | no | +| [runners\_scale\_up\_lambda\_timeout](#input\_runners\_scale\_up\_lambda\_timeout) | Time out for the scale up lambda in seconds. | `number` | `30` | no | | [scale\_down\_schedule\_expression](#input\_scale\_down\_schedule\_expression) | Scheduler expression to check every x for scale down. | `string` | `"cron(*/5 * * * ? *)"` | no | | [scale\_up\_reserved\_concurrent\_executions](#input\_scale\_up\_reserved\_concurrent\_executions) | Amount of reserved concurrent executions for the scale-up lambda function. A value of 0 disables lambda from being triggered and -1 removes any concurrency limitations. | `number` | `1` | no | | [subnet\_ids](#input\_subnet\_ids) | List of subnets in which the action runners will be launched, the subnets needs to be subnets in the `vpc_id`. 
| `list(string)` | n/a | yes | @@ -449,12 +452,13 @@ In case the setup does not work as intended follow the trace of events: ## Outputs -| Name | Description | -| ----------------------------------------------------------------------------------- | ----------- | -| [binaries\_syncer](#output\_binaries\_syncer) | n/a | -| [runners](#output\_runners) | n/a | -| [ssm\_parameters](#output\_ssm\_parameters) | n/a | -| [webhook](#output\_webhook) | n/a | +| Name | Description | +|------|-------------| +| [binaries\_syncer](#output\_binaries\_syncer) | n/a | +| [name](#output\_name) | n/a | +| [runners](#output\_runners) | n/a | +| [ssm\_parameters](#output\_ssm\_parameters) | n/a | +| [webhook](#output\_webhook) | n/a | ## Contribution diff --git a/examples/default/main.tf b/examples/default/main.tf index a8c40c47ee..08d370056e 100644 --- a/examples/default/main.tf +++ b/examples/default/main.tf @@ -30,10 +30,12 @@ module "runners" { webhook_secret = random_id.random.hex } - # webhook_lambda_zip = "lambdas-download/webhook.zip" - # runner_binaries_syncer_lambda_zip = "lambdas-download/runner-binaries-syncer.zip" - # runners_lambda_zip = "lambdas-download/runners.zip" - enable_organization_runners = true + # Grab zip files via lambda_download + webhook_lambda_zip = "lambdas-download/webhook.zip" + runner_binaries_syncer_lambda_zip = "lambdas-download/runner-binaries-syncer.zip" + runners_lambda_zip = "lambdas-download/runners.zip" + + enable_organization_runners = false runner_extra_labels = "default,example" # enable access to the runners via SSM @@ -61,14 +63,9 @@ module "runners" { instance_types = ["m5.large", "c5.large"] # override delay of events in seconds - delay_webhook_event = 10 - //job_queue_retention_in_seconds = 600 - //job_queue_retention_in_seconds = 60 + delay_webhook_event = 10 runners_maximum_count = 1 # override scaling down scale_down_schedule_expression = "cron(* * * * ? *)" - - enable_ephemeral_runners = true - disable_check_wokflow_job_labels = true } diff --git a/examples/ephemeral/main.tf b/examples/ephemeral/main.tf index 765b15b4ef..b394034a3e 100644 --- a/examples/ephemeral/main.tf +++ b/examples/ephemeral/main.tf @@ -1,16 +1,13 @@ locals { - environment = "default" + environment = "ephemeraal" aws_region = "eu-west-1" } -resource "random_password" "random" { - length = 28 +resource "random_id" "random" { + byte_length = 20 } - -################################################################################ -### Hybrid acccount -################################################################################ +data "aws_caller_identity" "current" {} module "runners" { source = "../../" @@ -27,36 +24,48 @@ module "runners" { github_app = { key_base64 = var.github_app_key_base64 id = var.github_app_id - webhook_secret = random_password.random.result + webhook_secret = random_id.random.hex } - # webhook_lambda_zip = "lambdas-download/webhook.zip" - # runner_binaries_syncer_lambda_zip = "lambdas-download/runner-binaries-syncer.zip" - # runners_lambda_zip = "lambdas-download/runners.zip" + # Grab the lambda packages from local directory. 
Must run /.ci/build.sh first + webhook_lambda_zip = "../../lambda_output/webhook.zip" + runner_binaries_syncer_lambda_zip = "../../lambda_output/runner-binaries-syncer.zip" + runners_lambda_zip = "../../lambda_output/runners.zip" + enable_organization_runners = true runner_extra_labels = "default,example" # enable access to the runners via SSM enable_ssm_on_runners = true - # Uncommet idle config to have idle runners from 9 to 5 in time zone Amsterdam - # idle_config = [{ - # cron = "* * 9-17 * * *" - # timeZone = "Europe/Amsterdam" - # idleCount = 1 - # }] - # Let the module manage the service linked role # create_service_linked_role_spot = true - instance_types = ["m5d.large"] + instance_types = ["m5.large", "c5.large"] # override delay of events in seconds delay_webhook_event = 0 + # Ensure you set the number not too low, each build require a new instance + runners_maximum_count = 20 + # override scaling down scale_down_schedule_expression = "cron(* * * * ? *)" enable_ephemeral_runners = true + # configure your pre-built AMI + # enabled_userdata = false + # ami_filter = { name = ["github-runner-amzn2-x86_64-2021*"] } + # ami_owners = [data.aws_caller_identity.current.account_id] + + # Enable logging + # log_level = "debug" + + # Setup a dead letter queue, by default scale up lambda will kepp retrying to process event in case of scaling error. + # redrive_policy_build_queue = { + # enabled = true + # maxReceiveCount = 50 # 50 retries every 30 seconds => 25 minutes + # deadLetterTargetArn = null + # } } diff --git a/examples/ephemeral/outputs.tf b/examples/ephemeral/outputs.tf index d6886efe36..c50214f566 100644 --- a/examples/ephemeral/outputs.tf +++ b/examples/ephemeral/outputs.tf @@ -10,6 +10,6 @@ output "webhook_endpoint" { output "webhook_secret" { sensitive = true - value = random_password.random.result + value = random_id.random.hex } diff --git a/examples/ephemeral/variables.tf b/examples/ephemeral/variables.tf index e43d3efccf..5701717dcf 100644 --- a/examples/ephemeral/variables.tf +++ b/examples/ephemeral/variables.tf @@ -7,6 +7,3 @@ variable "github_app_client_id" {} variable "github_app_client_secret" {} -variable "owner" { - -} diff --git a/main.tf b/main.tf index 595e06cde9..2713d6f958 100644 --- a/main.tf +++ b/main.tf @@ -18,14 +18,26 @@ resource "random_string" "random" { upper = false } +locals { + build_queue_dead_letter_arn = var.redrive_build_queue.enabled && var.redrive_build_queue.deadLetterTargetArn == null ? aws_sqs_queue.queued_builds_dlq[0].arn : var.redrive_build_queue.deadLetterTargetArn + redrive_policy = var.redrive_build_queue.enabled ? { deadLetterTargetArn = local.build_queue_dead_letter_arn, maxReceiveCount = var.redrive_build_queue.maxReceiveCount } : null +} resource "aws_sqs_queue" "queued_builds" { - name = "${var.environment}-queued-builds.fifo" + name = "${var.environment}-queued-builds${var.fifo_build_queue ? ".fifo" : ""}" delay_seconds = var.delay_webhook_event visibility_timeout_seconds = var.runners_scale_up_lambda_timeout message_retention_seconds = var.job_queue_retention_in_seconds - fifo_queue = true - receive_wait_time_seconds = 10 - content_based_deduplication = true + fifo_queue = var.fifo_build_queue + receive_wait_time_seconds = 0 + content_based_deduplication = var.fifo_build_queue + redrive_policy = local.redrive_policy == null ? 
null : jsonencode(local.redrive_policy) + + tags = var.tags +} + +resource "aws_sqs_queue" "queued_builds_dlq" { + count = var.redrive_build_queue.enabled && var.redrive_build_queue.deadLetterTargetArn == null ? 1 : 0 + name = "${var.environment}-queued-builds_dead_letter" tags = var.tags } diff --git a/modules/runners/runner-config.tf b/modules/runners/runner-config.tf index eb6370e58f..83ec7929cd 100644 --- a/modules/runners/runner-config.tf +++ b/modules/runners/runner-config.tf @@ -6,10 +6,9 @@ resource "aws_ssm_parameter" "runner_config_run_as" { } resource "aws_ssm_parameter" "runner_agent_mode" { - name = "/${var.environment}/runner/agent-mode" - type = "String" - # TODO: Update this to allow for ephemeral runners - value = "persistent" + name = "/${var.environment}/runner/agent-mode" + type = "String" + value = var.enable_ephemeral_runners ? "ephemeral" : "persistent" tags = local.tags } diff --git a/modules/runners/templates/start-runner.sh b/modules/runners/templates/start-runner.sh index f63b77f59e..3cedc0862b 100644 --- a/modules/runners/templates/start-runner.sh +++ b/modules/runners/templates/start-runner.sh @@ -71,7 +71,7 @@ if [[ $agent_mode = "ephemeral" ]]; then echo "Runner has finished" echo "Stopping cloudwatch service" - service awslogsd stop + systemctl stop amazon-cloudwatch-agent.service echo "Terminating instance" aws ec2 terminate-instances --instance-ids "$instance_id" --region "$region" else diff --git a/modules/webhook/lambdas/webhook/jest.config.js b/modules/webhook/lambdas/webhook/jest.config.js index 4a5b465ecb..02a6524ce9 100644 --- a/modules/webhook/lambdas/webhook/jest.config.js +++ b/modules/webhook/lambdas/webhook/jest.config.js @@ -1,4 +1,14 @@ module.exports = { preset: 'ts-jest', testEnvironment: 'node', + collectCoverage: true, + collectCoverageFrom: ['src/**/*.{ts,js,jsx}', '!src/**/*local*.ts'], + coverageThreshold: { + global: { + branches: 85, + functions: 85, + lines: 85, + statements: 85 + } + } }; diff --git a/modules/webhook/lambdas/webhook/src/sqs/index.test.ts b/modules/webhook/lambdas/webhook/src/sqs/index.test.ts new file mode 100644 index 0000000000..d44dfaca4d --- /dev/null +++ b/modules/webhook/lambdas/webhook/src/sqs/index.test.ts @@ -0,0 +1,65 @@ +import { SQS } from 'aws-sdk'; +import { sendActionRequest, ActionRequestMessage } from '.'; + +const mockSQS = { + sendMessage: jest.fn(() => { + { + return { promise: jest.fn() }; + } + }), +}; +jest.mock('aws-sdk', () => ({ + SQS: jest.fn().mockImplementation(() => mockSQS), +})); + +describe('Test sending message to SQS.', () => { + const message: ActionRequestMessage = { + eventType: 'type', + id: 0, + installationId: 0, + repositoryName: 'test', + repositoryOwner: 'owner', + }; + const sqsMessage: SQS.Types.SendMessageRequest = { + QueueUrl: 'https://sqs.eu-west-1.amazonaws.com/123456789/queued-builds', + MessageBody: JSON.stringify(message), + }; + + it('no fifo queue, based on defaults', async () => { + // Arrange + process.env.SQS_URL_WEBHOOK = sqsMessage.QueueUrl; + + // Act + const result = await sendActionRequest(message); + + // Assert + expect(mockSQS.sendMessage).toBeCalledWith(sqsMessage); + expect(result).resolves; + }); + + it('no fifo queue', async () => { + // Arrange + process.env.SQS_URL_WEBHOOK = sqsMessage.QueueUrl; + process.env.USE_FIFO_QUEUE = 'false'; + + // Act + const result = await sendActionRequest(message); + + // Assert + expect(mockSQS.sendMessage).toBeCalledWith(sqsMessage); + expect(result).resolves; + }); + + it('use a fifo queue', async () => { + // 
Arrange + process.env.SQS_URL_WEBHOOK = sqsMessage.QueueUrl; + process.env.USE_FIFO_QUEUE = 'true'; + + // Act + const result = await sendActionRequest(message); + + // Assert + expect(mockSQS.sendMessage).toBeCalledWith({ ...sqsMessage, MessageGroupId: String(message.id) }); + expect(result).resolves; + }); +}); diff --git a/modules/webhook/lambdas/webhook/src/sqs/index.ts b/modules/webhook/lambdas/webhook/src/sqs/index.ts index 1a6e75e808..1e0467caa8 100644 --- a/modules/webhook/lambdas/webhook/src/sqs/index.ts +++ b/modules/webhook/lambdas/webhook/src/sqs/index.ts @@ -1,10 +1,5 @@ -import AWS, { SQS } from 'aws-sdk'; - -AWS.config.update({ - region: process.env.AWS_REGION, -}); - -const sqs = new SQS(); +import { SQS } from 'aws-sdk'; +import { logger as logger } from '../webhook/logger'; export interface ActionRequestMessage { id: number; @@ -15,11 +10,20 @@ export interface ActionRequestMessage { } export const sendActionRequest = async (message: ActionRequestMessage): Promise => { - await sqs - .sendMessage({ - QueueUrl: String(process.env.SQS_URL_WEBHOOK), - MessageBody: JSON.stringify(message), - MessageGroupId: String(message.id), - }) - .promise(); + const sqs = new SQS({ region: process.env.AWS_REGION }); + + const useFifoQueueEnv = process.env.USE_FIFO_QUEUE || 'false'; + const useFifoQueue = JSON.parse(useFifoQueueEnv) as boolean; + + const sqsMessage: SQS.Types.SendMessageRequest = { + QueueUrl: String(process.env.SQS_URL_WEBHOOK), + MessageBody: JSON.stringify(message), + }; + + logger.debug(`sending message to SQS: ${JSON.stringify(sqsMessage)}`); + if (useFifoQueue) { + sqsMessage.MessageGroupId = String(message.id); + } + + await sqs.sendMessage(sqsMessage).promise(); }; diff --git a/modules/webhook/lambdas/webhook/src/webhook/handler.test.ts b/modules/webhook/lambdas/webhook/src/webhook/handler.test.ts index 72e84219ec..76bbf737a3 100644 --- a/modules/webhook/lambdas/webhook/src/webhook/handler.test.ts +++ b/modules/webhook/lambdas/webhook/src/webhook/handler.test.ts @@ -98,6 +98,17 @@ describe('handler', () => { expect(sendActionRequest).not.toBeCalled(); }); + it('handles workflow_job events without installation id', async () => { + const event = JSON.stringify({ ...workflowjob_event, installation: null }); + process.env.REPOSITORY_WHITE_LIST = '["philips-labs/terraform-aws-github-runner"]'; + const resp = await handle( + { 'X-Hub-Signature': await webhooks.sign(event), 'X-GitHub-Event': 'workflow_job' }, + event, + ); + expect(resp.statusCode).toBe(201); + expect(sendActionRequest).toBeCalled(); + }); + it('handles workflow_job events from whitelisted repositories', async () => { const event = JSON.stringify(workflowjob_event); process.env.REPOSITORY_WHITE_LIST = '["philips-labs/terraform-aws-github-runner"]'; @@ -264,5 +275,15 @@ describe('handler', () => { expect(resp.statusCode).toBe(201); expect(sendActionRequest).toBeCalled(); }); + + it('handles check_run events with no installation id.', async () => { + const event = JSON.stringify({ ...checkrun_event, installation: { id: null } }); + const resp = await handle( + { 'X-Hub-Signature': await webhooks.sign(event), 'X-GitHub-Event': 'check_run' }, + event, + ); + expect(resp.statusCode).toBe(201); + expect(sendActionRequest).toBeCalled(); + }); }); }); diff --git a/modules/webhook/lambdas/webhook/src/webhook/handler.ts b/modules/webhook/lambdas/webhook/src/webhook/handler.ts index 11b9b1ec76..7e56480e2b 100644 --- a/modules/webhook/lambdas/webhook/src/webhook/handler.ts +++ 
b/modules/webhook/lambdas/webhook/src/webhook/handler.ts @@ -101,10 +101,7 @@ async function handleWorkflowJob(body: WorkflowJobEvent, githubEvent: string): P }; } - let installationId = body.installation?.id; - if (installationId == null) { - installationId = 0; - } + const installationId = getInstallationId(body); if (body.action === 'queued') { await sendActionRequest({ id: body.workflow_job.id, @@ -119,10 +116,7 @@ async function handleWorkflowJob(body: WorkflowJobEvent, githubEvent: string): P } async function handleCheckRun(body: CheckRunEvent, githubEvent: string): Promise { - let installationId = body.installation?.id; - if (installationId == null) { - installationId = 0; - } + const installationId = getInstallationId(body); if (body.action === 'created' && body.check_run.status === 'queued') { await sendActionRequest({ id: body.check_run.id, @@ -136,6 +130,14 @@ async function handleCheckRun(body: CheckRunEvent, githubEvent: string): Promise return { statusCode: 201 }; } +function getInstallationId(body: WorkflowJobEvent | CheckRunEvent) { + let installationId = body.installation?.id; + if (installationId == null) { + installationId = 0; + } + return installationId; +} + function isRepoNotAllowed(repo_full_name: string): boolean { const repositoryWhiteListEnv = process.env.REPOSITORY_WHITE_LIST || '[]'; const repositoryWhiteList = JSON.parse(repositoryWhiteListEnv) as Array; diff --git a/outputs.tf b/outputs.tf index 1c2bef291a..f4d4408ae0 100644 --- a/outputs.tf +++ b/outputs.tf @@ -32,3 +32,8 @@ output "webhook" { output "ssm_parameters" { value = module.ssm.parameters } + + +output "name" { + value = aws_sqs_queue.queued_builds_dlq.* +} diff --git a/variables.tf b/variables.tf index df98e1c79d..3f615ad37f 100644 --- a/variables.tf +++ b/variables.tf @@ -96,7 +96,7 @@ variable "runners_lambda_zip" { variable "runners_scale_up_lambda_timeout" { description = "Time out for the scale up lambda in seconds." type = number - default = 180 + default = 30 } variable "runners_scale_down_lambda_timeout" { @@ -478,3 +478,27 @@ variable "lambda_principals" { })) default = [] } + +variable "fifo_build_queue" { + description = "Enable a FIFO queue to remain the order of events received by the webhook. Suggest to set to true for repo level runners." + type = bool + default = false +} + +variable "redrive_build_queue" { + description = "Set options to attach (optional) a dead letter queue to the build queue, the queue between the webhook and the scale up lambda. You have the following options. 1. Disable by setting, `enalbed' to false. 2. Enable by setting `enabled` to `true`, `maxReceiveCount` to a number of max retries, and `deadLetterTargetArn` to null for letting the module create a queue. Or otherwise provide you own queue by setting an ARN." + type = object({ + enabled = bool + maxReceiveCount = number + deadLetterTargetArn = string + }) + default = { + enabled = false + maxReceiveCount = null + deadLetterTargetArn = null + } + validation { + condition = var.redrive_build_queue.enabled && var.redrive_build_queue.maxReceiveCount != null || !var.redrive_build_queue.enabled + error_message = "Ensure you have set the maxReceiveCount when enabled." 
+ } +} From 3f46be84de42f8d88eb45fd2bac6b32f8ffddc6f Mon Sep 17 00:00:00 2001 From: Niek Palm Date: Fri, 10 Dec 2021 17:28:37 +0100 Subject: [PATCH 15/31] cleanup --- examples/ephemeral/variables.tf | 4 ---- 1 file changed, 4 deletions(-) diff --git a/examples/ephemeral/variables.tf b/examples/ephemeral/variables.tf index 5701717dcf..1f4576b1b5 100644 --- a/examples/ephemeral/variables.tf +++ b/examples/ephemeral/variables.tf @@ -3,7 +3,3 @@ variable "github_app_key_base64" {} variable "github_app_id" {} -variable "github_app_client_id" {} - -variable "github_app_client_secret" {} - From 4eb82ac25b67b43a95352701c67f3b4123f728d2 Mon Sep 17 00:00:00 2001 From: Niek Palm Date: Fri, 10 Dec 2021 17:30:18 +0100 Subject: [PATCH 16/31] cleanup --- .ci/build-yarn.sh | 2 + examples/default/main.tf | 3 + main.tf | 1 + modules/webhook/README.md | 81 ++++++++++--------- .../lambdas/webhook/src/sqs/index.test.ts | 4 +- .../webhook/lambdas/webhook/src/sqs/index.ts | 2 +- modules/webhook/variables.tf | 6 ++ modules/webhook/webhook.tf | 1 + 8 files changed, 58 insertions(+), 42 deletions(-) diff --git a/.ci/build-yarn.sh b/.ci/build-yarn.sh index b0f847113d..1283d9ea8b 100755 --- a/.ci/build-yarn.sh +++ b/.ci/build-yarn.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +# Build all the lambda's, output on the default place (inside the lambda module) + lambdaSrcDirs=("modules/runner-binaries-syncer/lambdas/runner-binaries-syncer" "modules/runners/lambdas/runners" "modules/webhook/lambdas/webhook") repoRoot=$(dirname $(dirname $(realpath ${BASH_SOURCE[0]}))) diff --git a/examples/default/main.tf b/examples/default/main.tf index 08d370056e..f38be98f6d 100644 --- a/examples/default/main.tf +++ b/examples/default/main.tf @@ -66,6 +66,9 @@ module "runners" { delay_webhook_event = 10 runners_maximum_count = 1 + # set up a fifo queue to remain order + fifo_build_queue = true + # override scaling down scale_down_schedule_expression = "cron(* * * * ? *)" } diff --git a/main.tf b/main.tf index 2713d6f958..c2d5cab8e9 100644 --- a/main.tf +++ b/main.tf @@ -60,6 +60,7 @@ module "webhook" { kms_key_arn = var.kms_key_arn sqs_build_queue = aws_sqs_queue.queued_builds + sqs_build_queue_fifo = var.fifo_build_queue github_app_webhook_secret_arn = module.ssm.parameters.github_app_webhook_secret.arn lambda_s3_bucket = var.lambda_s3_bucket diff --git a/modules/webhook/README.md b/modules/webhook/README.md index ad9beca0b1..2430972b5e 100644 --- a/modules/webhook/README.md +++ b/modules/webhook/README.md @@ -38,66 +38,69 @@ yarn run dist | Name | Version | |------|---------| -| terraform | >= 0.14.1 | -| aws | >= 3.38 | +| [terraform](#requirement\_terraform) | >= 0.14.1 | +| [aws](#requirement\_aws) | >= 3.38 | ## Providers | Name | Version | |------|---------| -| aws | >= 3.38 | +| [aws](#provider\_aws) | >= 3.38 | ## Modules -No Modules. +No modules. 
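As an aside, a rough sketch of how this webhook submodule receives the new `sqs_build_queue_fifo` flag when wired up from the root module; the region and environment values are assumptions, while the queue and SSM references mirror the root `main.tf` in this series:

```hcl
# Illustrative wiring only, following the root module's main.tf from this series.
module "webhook" {
  source = "./modules/webhook"

  aws_region  = "eu-west-1" # assumed value
  environment = "default"   # assumed value

  sqs_build_queue      = aws_sqs_queue.queued_builds
  sqs_build_queue_fifo = var.fifo_build_queue # true when the build queue is created as FIFO

  github_app_webhook_secret_arn = module.ssm.parameters.github_app_webhook_secret.arn
}
```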
## Resources -| Name | -|------| -| [aws_apigatewayv2_api](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/apigatewayv2_api) | -| [aws_apigatewayv2_integration](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/apigatewayv2_integration) | -| [aws_apigatewayv2_route](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/apigatewayv2_route) | -| [aws_apigatewayv2_stage](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/apigatewayv2_stage) | -| [aws_cloudwatch_log_group](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | -| [aws_iam_policy_document](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | -| [aws_iam_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | -| [aws_iam_role_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | -| [aws_lambda_function](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_function) | -| [aws_lambda_permission](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_permission) | +| Name | Type | +|------|------| +| [aws_apigatewayv2_api.webhook](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/apigatewayv2_api) | resource | +| [aws_apigatewayv2_integration.webhook](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/apigatewayv2_integration) | resource | +| [aws_apigatewayv2_route.webhook](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/apigatewayv2_route) | resource | +| [aws_apigatewayv2_stage.webhook](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/apigatewayv2_stage) | resource | +| [aws_cloudwatch_log_group.webhook](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource | +| [aws_iam_role.webhook_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource | +| [aws_iam_role_policy.webhook_logging](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | +| [aws_iam_role_policy.webhook_sqs](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | +| [aws_iam_role_policy.webhook_ssm](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | +| [aws_lambda_function.webhook](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_function) | resource | +| [aws_lambda_permission.webhook](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_permission) | resource | +| [aws_iam_policy_document.lambda_assume_role_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | ## Inputs | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| aws\_region | AWS region. | `string` | n/a | yes | -| disable\_check\_wokflow\_job\_labels | Disable the the check of workflow labels. | `bool` | `false` | no | -| environment | A name that identifies the environment, used as prefix and for tagging. 
| `string` | n/a | yes | -| github\_app\_webhook\_secret\_arn | n/a | `string` | n/a | yes | -| kms\_key\_arn | Optional CMK Key ARN to be used for Parameter Store. | `string` | `null` | no | -| lambda\_s3\_bucket | S3 bucket from which to specify lambda functions. This is an alternative to providing local files directly. | `any` | `null` | no | -| lambda\_timeout | Time out of the lambda in seconds. | `number` | `10` | no | -| lambda\_zip | File location of the lambda zip file. | `string` | `null` | no | -| log\_level | Logging level for lambda logging. Valid values are 'silly', 'trace', 'debug', 'info', 'warn', 'error', 'fatal'. | `string` | `"info"` | no | -| log\_type | Logging format for lambda logging. Valid values are 'json', 'pretty', 'hidden'. | `string` | `"pretty"` | no | -| logging\_retention\_in\_days | Specifies the number of days you want to retain log events for the lambda log group. Possible values are: 0, 1, 3, 5, 7, 14, 30, 60, 90, 120, 150, 180, 365, 400, 545, 731, 1827, and 3653. | `number` | `7` | no | -| repository\_white\_list | List of repositories allowed to use the github app | `list(string)` | `[]` | no | -| role\_path | The path that will be added to the role; if not set, the environment name will be used. | `string` | `null` | no | -| role\_permissions\_boundary | Permissions boundary that will be added to the created role for the lambda. | `string` | `null` | no | -| runner\_extra\_labels | Extra labels for the runners (GitHub). Separate each label by a comma | `string` | `""` | no | -| sqs\_build\_queue | SQS queue to publish accepted build events. |
<pre>object({<br>  id  = string<br>  arn = string<br>})</pre>
| n/a | yes | -| tags | Map of tags that will be added to created resources. By default resources will be tagged with name and environment. | `map(string)` | `{}` | no | -| webhook\_lambda\_s3\_key | S3 key for webhook lambda function. Required if using S3 bucket to specify lambdas. | `any` | `null` | no | -| webhook\_lambda\_s3\_object\_version | S3 object version for webhook lambda function. Useful if S3 versioning is enabled on source bucket. | `any` | `null` | no | +| [aws\_region](#input\_aws\_region) | AWS region. | `string` | n/a | yes | +| [disable\_check\_wokflow\_job\_labels](#input\_disable\_check\_wokflow\_job\_labels) | Disable the the check of workflow labels. | `bool` | `false` | no | +| [environment](#input\_environment) | A name that identifies the environment, used as prefix and for tagging. | `string` | n/a | yes | +| [github\_app\_webhook\_secret\_arn](#input\_github\_app\_webhook\_secret\_arn) | n/a | `string` | n/a | yes | +| [kms\_key\_arn](#input\_kms\_key\_arn) | Optional CMK Key ARN to be used for Parameter Store. | `string` | `null` | no | +| [lambda\_s3\_bucket](#input\_lambda\_s3\_bucket) | S3 bucket from which to specify lambda functions. This is an alternative to providing local files directly. | `any` | `null` | no | +| [lambda\_timeout](#input\_lambda\_timeout) | Time out of the lambda in seconds. | `number` | `10` | no | +| [lambda\_zip](#input\_lambda\_zip) | File location of the lambda zip file. | `string` | `null` | no | +| [log\_level](#input\_log\_level) | Logging level for lambda logging. Valid values are 'silly', 'trace', 'debug', 'info', 'warn', 'error', 'fatal'. | `string` | `"info"` | no | +| [log\_type](#input\_log\_type) | Logging format for lambda logging. Valid values are 'json', 'pretty', 'hidden'. | `string` | `"pretty"` | no | +| [logging\_retention\_in\_days](#input\_logging\_retention\_in\_days) | Specifies the number of days you want to retain log events for the lambda log group. Possible values are: 0, 1, 3, 5, 7, 14, 30, 60, 90, 120, 150, 180, 365, 400, 545, 731, 1827, and 3653. | `number` | `7` | no | +| [repository\_white\_list](#input\_repository\_white\_list) | List of repositories allowed to use the github app | `list(string)` | `[]` | no | +| [role\_path](#input\_role\_path) | The path that will be added to the role; if not set, the environment name will be used. | `string` | `null` | no | +| [role\_permissions\_boundary](#input\_role\_permissions\_boundary) | Permissions boundary that will be added to the created role for the lambda. | `string` | `null` | no | +| [runner\_extra\_labels](#input\_runner\_extra\_labels) | Extra labels for the runners (GitHub). Separate each label by a comma | `string` | `""` | no | +| [sqs\_build\_queue](#input\_sqs\_build\_queue) | SQS queue to publish accepted build events. |
<pre>object({<br>  id  = string<br>  arn = string<br>})</pre>
| n/a | yes | +| [sqs\_build\_queue\_fifo](#input\_sqs\_build\_queue\_fifo) | Enable a FIFO queue to remain the order of events received by the webhook. Suggest to set to true for repo level runners. | `bool` | `false` | no | +| [tags](#input\_tags) | Map of tags that will be added to created resources. By default resources will be tagged with name and environment. | `map(string)` | `{}` | no | +| [webhook\_lambda\_s3\_key](#input\_webhook\_lambda\_s3\_key) | S3 key for webhook lambda function. Required if using S3 bucket to specify lambdas. | `any` | `null` | no | +| [webhook\_lambda\_s3\_object\_version](#input\_webhook\_lambda\_s3\_object\_version) | S3 object version for webhook lambda function. Useful if S3 versioning is enabled on source bucket. | `any` | `null` | no | ## Outputs | Name | Description | |------|-------------| -| endpoint\_relative\_path | n/a | -| gateway | n/a | -| lambda | n/a | -| role | n/a | +| [endpoint\_relative\_path](#output\_endpoint\_relative\_path) | n/a | +| [gateway](#output\_gateway) | n/a | +| [lambda](#output\_lambda) | n/a | +| [role](#output\_role) | n/a | ## Philips Forest diff --git a/modules/webhook/lambdas/webhook/src/sqs/index.test.ts b/modules/webhook/lambdas/webhook/src/sqs/index.test.ts index d44dfaca4d..de8570157f 100644 --- a/modules/webhook/lambdas/webhook/src/sqs/index.test.ts +++ b/modules/webhook/lambdas/webhook/src/sqs/index.test.ts @@ -40,7 +40,7 @@ describe('Test sending message to SQS.', () => { it('no fifo queue', async () => { // Arrange process.env.SQS_URL_WEBHOOK = sqsMessage.QueueUrl; - process.env.USE_FIFO_QUEUE = 'false'; + process.env.SQS_IS_FIFO = 'false'; // Act const result = await sendActionRequest(message); @@ -53,7 +53,7 @@ describe('Test sending message to SQS.', () => { it('use a fifo queue', async () => { // Arrange process.env.SQS_URL_WEBHOOK = sqsMessage.QueueUrl; - process.env.USE_FIFO_QUEUE = 'true'; + process.env.SQS_IS_FIFO = 'true'; // Act const result = await sendActionRequest(message); diff --git a/modules/webhook/lambdas/webhook/src/sqs/index.ts b/modules/webhook/lambdas/webhook/src/sqs/index.ts index 1e0467caa8..2217718f7d 100644 --- a/modules/webhook/lambdas/webhook/src/sqs/index.ts +++ b/modules/webhook/lambdas/webhook/src/sqs/index.ts @@ -12,7 +12,7 @@ export interface ActionRequestMessage { export const sendActionRequest = async (message: ActionRequestMessage): Promise => { const sqs = new SQS({ region: process.env.AWS_REGION }); - const useFifoQueueEnv = process.env.USE_FIFO_QUEUE || 'false'; + const useFifoQueueEnv = process.env.SQS_IS_FIFO || 'false'; const useFifoQueue = JSON.parse(useFifoQueueEnv) as boolean; const sqsMessage: SQS.Types.SendMessageRequest = { diff --git a/modules/webhook/variables.tf b/modules/webhook/variables.tf index 5a767fc1b8..1eb17c9d38 100644 --- a/modules/webhook/variables.tf +++ b/modules/webhook/variables.tf @@ -126,3 +126,9 @@ variable "disable_check_wokflow_job_labels" { type = bool default = false } + +variable "sqs_build_queue_fifo" { + description = "Enable a FIFO queue to remain the order of events received by the webhook. Suggest to set to true for repo level runners." 
+ type = bool + default = false +} diff --git a/modules/webhook/webhook.tf b/modules/webhook/webhook.tf index 1115b985f1..8ea42b2d28 100644 --- a/modules/webhook/webhook.tf +++ b/modules/webhook/webhook.tf @@ -19,6 +19,7 @@ resource "aws_lambda_function" "webhook" { REPOSITORY_WHITE_LIST = jsonencode(var.repository_white_list) RUNNER_LABELS = jsonencode(split(",", var.runner_extra_labels)) SQS_URL_WEBHOOK = var.sqs_build_queue.id + SQS_IS_FIFO = var.sqs_build_queue_fifo } } From bfff210b62afda175bd60b5ffa5ec538fb95bbbe Mon Sep 17 00:00:00 2001 From: Niek Palm Date: Fri, 10 Dec 2021 18:17:48 +0100 Subject: [PATCH 17/31] sync develop --- .../lambdas/runners/src/scale-runners/scale-up.ts | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts b/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts index d2557b0955..f6e11e4728 100644 --- a/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts +++ b/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts @@ -46,17 +46,6 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage logger.info(`Received event`, LogFields.print()); - const runnerType = enableOrgLevel ? 'Org' : 'Repo'; - const runnerOwner = enableOrgLevel ? payload.repositoryOwner : `${payload.repositoryOwner}/${payload.repositoryName}`; - - LogFields.fields = {}; - LogFields.fields.runnerType = runnerType; - LogFields.fields.runnerOwner = runnerOwner; - LogFields.fields.event = payload.eventType; - LogFields.fields.id = payload.id.toString(); - - logger.info(`Received event`, LogFields.print()); - let ghesApiUrl = ''; if (ghesBaseUrl) { ghesApiUrl = `${ghesBaseUrl}/api/v3`; From f0e98805b7c9796f2aac6240b1424c96889d0392 Mon Sep 17 00:00:00 2001 From: Niek Palm Date: Fri, 10 Dec 2021 19:11:14 +0100 Subject: [PATCH 18/31] review fix Co-authored-by: Scott Guymer --- modules/runners/lambdas/runners/src/lambda.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/runners/lambdas/runners/src/lambda.ts b/modules/runners/lambdas/runners/src/lambda.ts index 23a7d7089a..6fc49c4abf 100644 --- a/modules/runners/lambdas/runners/src/lambda.ts +++ b/modules/runners/lambdas/runners/src/lambda.ts @@ -9,7 +9,7 @@ export async function scaleUpHandler(event: SQSEvent, context: Context): Promise logger.setSettings({ requestId: context.awsRequestId }); logger.debug(JSON.stringify(event)); if (event.Records.length != 1) { - logger.warn('Event ignored, only on record at the time can be handled, ensure the lambda batch size is set to 1.'); + logger.warn('Event ignored, only one record at the time can be handled, ensure the lambda batch size is set to 1.'); return new Promise((resolve) => resolve()); } From 5cda2947fca6b4b671e8c20aa1e6b6ae920a3917 Mon Sep 17 00:00:00 2001 From: Niek Palm Date: Fri, 10 Dec 2021 19:11:52 +0100 Subject: [PATCH 19/31] review fix Co-authored-by: Scott Guymer --- modules/runners/lambdas/runners/src/scale-runners/scale-up.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts b/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts index f6e11e4728..4e38071f57 100644 --- a/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts +++ b/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts @@ -28,7 +28,7 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage const ephemeralEnabled = yn(process.env.ENABLE_EPHEMERAL_RUNNERS, { 
default: false }); if (ephemeralEnabled && payload.eventType != 'workflow_job') { - logger.warn(`${payload.eventType} even is not supported in combination with ephemeral runners.`); + logger.warn(`${payload.eventType} event is not supported in combination with ephemeral runners.`); throw Error( `The workflow_job type ${payload.eventType} is not supported in combination with ephemeral runners.` + `Please ensure you have enabled workflow_job events.`, From 4acb13787656f18f3e18a05d242fa5e431261809 Mon Sep 17 00:00:00 2001 From: Niek Palm Date: Fri, 10 Dec 2021 19:12:07 +0100 Subject: [PATCH 20/31] review vfix Co-authored-by: Scott Guymer --- modules/runners/lambdas/runners/src/scale-runners/scale-up.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts b/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts index 4e38071f57..a02ad3402a 100644 --- a/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts +++ b/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts @@ -30,7 +30,7 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage if (ephemeralEnabled && payload.eventType != 'workflow_job') { logger.warn(`${payload.eventType} event is not supported in combination with ephemeral runners.`); throw Error( - `The workflow_job type ${payload.eventType} is not supported in combination with ephemeral runners.` + + `The event type ${payload.eventType} is not supported in combination with ephemeral runners.` + `Please ensure you have enabled workflow_job events.`, ); } From 7b074c842c3765af488813d525748cd6d71a5dbe Mon Sep 17 00:00:00 2001 From: Niek Palm Date: Fri, 10 Dec 2021 19:12:25 +0100 Subject: [PATCH 21/31] review vfix Co-authored-by: Scott Guymer --- .../runners/lambdas/runners/src/scale-runners/scale-up.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts b/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts index b472911edc..835433151e 100644 --- a/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts +++ b/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts @@ -543,7 +543,7 @@ describe('scaleUp with public GH', () => { expect(createRunner).toHaveBeenNthCalledWith(2, expectedRunnerParams, 'lt-2'); }); - it('ephemeral runners cannot only run with workflow_job event, others should fail.', async () => { + it('ephemeral runners only run with workflow_job event, others should fail.', async () => { process.env.ENABLE_EPHEMERAL_RUNNERS = 'true'; await expect( scaleUpModule.scaleUp('aws:sqs', { From 3b8f28a355684bae6a01805b845d186b1673a890 Mon Sep 17 00:00:00 2001 From: Niek Palm Date: Fri, 10 Dec 2021 19:15:03 +0100 Subject: [PATCH 22/31] fix review --- examples/default/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/default/main.tf b/examples/default/main.tf index f38be98f6d..46e57fc183 100644 --- a/examples/default/main.tf +++ b/examples/default/main.tf @@ -63,7 +63,7 @@ module "runners" { instance_types = ["m5.large", "c5.large"] # override delay of events in seconds - delay_webhook_event = 10 + delay_webhook_event = 5 runners_maximum_count = 1 # set up a fifo queue to remain order From 2dd5848cb5c6fd8f2260dd9f871d38cd26b95e31 Mon Sep 17 00:00:00 2001 From: Niek Palm Date: Mon, 13 Dec 2021 08:17:04 +0100 Subject: [PATCH 23/31] process review comments --- modules/runners/lambdas/runners/src/lambda.test.ts | 
2 +- modules/runners/lambdas/runners/src/lambda.ts | 2 +- modules/runners/lambdas/runners/src/scale-runners/runners.ts | 2 +- modules/runners/lambdas/runners/src/scale-runners/scale-up.ts | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/runners/lambdas/runners/src/lambda.test.ts b/modules/runners/lambdas/runners/src/lambda.test.ts index f1a916372e..f77cfac0b1 100644 --- a/modules/runners/lambdas/runners/src/lambda.test.ts +++ b/modules/runners/lambdas/runners/src/lambda.test.ts @@ -111,7 +111,7 @@ async function testInvalidRecords(sqsRecords: SQSRecord[]) { await expect(scaleUpHandler(sqsEventMultipleRecords, context)).resolves; expect(logWarnSpy).toHaveBeenCalledWith( - 'Event ignored, only on record at the time can be handled, ensure the lambda batch size is set to 1.', + 'Event ignored, only one record at the time can be handled, ensure the lambda batch size is set to 1.', ); } diff --git a/modules/runners/lambdas/runners/src/lambda.ts b/modules/runners/lambdas/runners/src/lambda.ts index 6fc49c4abf..9fc0e783c8 100644 --- a/modules/runners/lambdas/runners/src/lambda.ts +++ b/modules/runners/lambdas/runners/src/lambda.ts @@ -8,7 +8,7 @@ import 'source-map-support/register'; export async function scaleUpHandler(event: SQSEvent, context: Context): Promise { logger.setSettings({ requestId: context.awsRequestId }); logger.debug(JSON.stringify(event)); - if (event.Records.length != 1) { + if (event.Records.length !== 1) { logger.warn('Event ignored, only one record at the time can be handled, ensure the lambda batch size is set to 1.'); return new Promise((resolve) => resolve()); } diff --git a/modules/runners/lambdas/runners/src/scale-runners/runners.ts b/modules/runners/lambdas/runners/src/scale-runners/runners.ts index ba2409467c..9670f4e025 100644 --- a/modules/runners/lambdas/runners/src/scale-runners/runners.ts +++ b/modules/runners/lambdas/runners/src/scale-runners/runners.ts @@ -90,7 +90,7 @@ export async function createRunner(runnerParameters: RunnerInputParameters, laun LogFields.print(), ); const ssm = new SSM(); - if (runInstancesResponse.Instances != undefined) { + if (runInstancesResponse.Instances) { for (let i = 0; i < runInstancesResponse.Instances?.length; i++) { await ssm .putParameter({ diff --git a/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts b/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts index a02ad3402a..436005c157 100644 --- a/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts +++ b/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts @@ -27,7 +27,7 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage const ghesBaseUrl = process.env.GHES_URL; const ephemeralEnabled = yn(process.env.ENABLE_EPHEMERAL_RUNNERS, { default: false }); - if (ephemeralEnabled && payload.eventType != 'workflow_job') { + if (ephemeralEnabled && payload.eventType !== 'workflow_job') { logger.warn(`${payload.eventType} event is not supported in combination with ephemeral runners.`); throw Error( `The event type ${payload.eventType} is not supported in combination with ephemeral runners.` + From 6c6696589d87bfecaf09d2555751341036d75057 Mon Sep 17 00:00:00 2001 From: Niek Palm Date: Mon, 13 Dec 2021 08:19:08 +0100 Subject: [PATCH 24/31] process review comments --- modules/runners/lambdas/runners/src/lambda.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/runners/lambdas/runners/src/lambda.ts b/modules/runners/lambdas/runners/src/lambda.ts index 
9fc0e783c8..183f0ac502 100644 --- a/modules/runners/lambdas/runners/src/lambda.ts +++ b/modules/runners/lambdas/runners/src/lambda.ts @@ -20,7 +20,7 @@ export async function scaleUpHandler(event: SQSEvent, context: Context): Promise if (e instanceof ScaleError) { reject(e); } else { - logger.warn('Ignoring error: ', e); + logger.warn(`Ignoring error: ${e.message}`); } }); }); From abeac1a206d4551ee5ff56fc7716deafeb94e63b Mon Sep 17 00:00:00 2001 From: Niek Palm Date: Mon, 13 Dec 2021 09:58:02 +0100 Subject: [PATCH 25/31] review comment --- .../lambdas/runners/src/scale-runners/scale-up.test.ts | 2 +- .../lambdas/runners/src/scale-runners/scale-up.ts | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts b/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts index 835433151e..6bd0449f0b 100644 --- a/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts +++ b/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts @@ -556,7 +556,7 @@ describe('scaleUp with public GH', () => { it('creates a ephemeral runner.', async () => { process.env.ENABLE_EPHEMERAL_RUNNERS = 'true'; await scaleUpModule.scaleUp('aws:sqs', TEST_DATA); - expectedRunnerParams.runnerServiceConfig = expectedRunnerParams.runnerServiceConfig + ` --ephemeral`; + expectedRunnerParams.runnerServiceConfig = expectedRunnerParams.runnerServiceConfig + ` --ephemeral`; expect(createRunner).toBeCalledWith(expectedRunnerParams, LAUNCH_TEMPLATE); }); diff --git a/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts b/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts index 436005c157..6eca9b43b2 100644 --- a/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts +++ b/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts @@ -91,16 +91,16 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage }); const token = registrationToken.data.token; - const labelsArgument = runnerExtraLabels !== undefined ? `--labels ${runnerExtraLabels}` : ''; - const runnerGroupArgument = runnerGroup !== undefined ? `--runnergroup ${runnerGroup}` : ''; + const labelsArgument = runnerExtraLabels !== undefined ? `--labels ${runnerExtraLabels} ` : ''; + const runnerGroupArgument = runnerGroup !== undefined ? `--runnergroup ${runnerGroup} ` : ''; const configBaseUrl = ghesBaseUrl ? ghesBaseUrl : 'https://github.com'; - const ephemeralArgument = ephemeral ? '--ephemeral' : ''; - const runnerArgs = `--token ${token} ${labelsArgument} ${ephemeralArgument}`.trim(); + const ephemeralArgument = ephemeral ? '--ephemeral ' : ''; + const runnerArgs = `--token ${token} ${labelsArgument}${ephemeralArgument}`; await createRunnerLoop({ environment, runnerServiceConfig: enableOrgLevel - ? `--url ${configBaseUrl}/${payload.repositoryOwner} ${runnerArgs} ${runnerGroupArgument}`.trim() + ? 
`--url ${configBaseUrl}/${payload.repositoryOwner} ${runnerArgs}${runnerGroupArgument}`.trim() : `--url ${configBaseUrl}/${payload.repositoryOwner}/${payload.repositoryName} ${runnerArgs}`.trim(), runnerOwner, runnerType, From 17dba00ccf3189e337ff47d09bd8b59aa4a7af59 Mon Sep 17 00:00:00 2001 From: Niek Palm Date: Tue, 14 Dec 2021 15:20:21 +0100 Subject: [PATCH 26/31] process review comments --- README.md | 5 ++--- examples/ephemeral/README.md | 3 +-- main.tf | 11 +++++------ outputs.tf | 8 ++++++-- variables.tf | 12 +++++------- 5 files changed, 19 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 1c772a3eab..73747159cf 100644 --- a/README.md +++ b/README.md @@ -299,7 +299,6 @@ Examples are located in the [examples](./examples) directory. The following exam - _[Ubuntu](examples/ubuntu/README.md)_: Example usage of creating a runner using Ubuntu AMIs. - _[Prebuilt Images](examples/prebuilt/README.md)_: Example usages of deploying runners with a custom prebuilt image. - _[Windows](examples/windows/README.md)_: Example usage of creating a runner using Windows as the OS. -- _[Prebuilt Images](examples/prebuilt/README.md)_: Example usages of deploying runners with a custom prebuilt image. ## Sub modules @@ -409,7 +408,7 @@ In case the setup does not work as intended follow the trace of events: | [logging\_retention\_in\_days](#input\_logging\_retention\_in\_days) | Specifies the number of days you want to retain log events for the lambda log group. Possible values are: 0, 1, 3, 5, 7, 14, 30, 60, 90, 120, 150, 180, 365, 400, 545, 731, 1827, and 3653. | `number` | `180` | no | | [market\_options](#input\_market\_options) | Market options for the action runner instances. Setting the value to `null` let the scaler create on-demand instances instead of spot instances. | `string` | `"spot"` | no | | [minimum\_running\_time\_in\_minutes](#input\_minimum\_running\_time\_in\_minutes) | The time an ec2 action runner should be running at minimum before terminated if not busy. | `number` | `null` | no | -| [redrive\_build\_queue](#input\_redrive\_build\_queue) | Set options to attach (optional) a dead letter queue to the build queue, the queue between the webhook and the scale up lambda. You have the following options. 1. Disable by setting, `enalbed' to false. 2. Enable by setting `enabled` to `true`, `maxReceiveCount` to a number of max retries, and `deadLetterTargetArn` to null for letting the module create a queue. Or otherwise provide you own queue by setting an ARN.` |
object({
enabled = bool
maxReceiveCount = number
deadLetterTargetArn = string
})
|
{
"deadLetterTargetArn": null,
"enabled": false,
"maxReceiveCount": null
}
| no | +| [redrive\_build\_queue](#input\_redrive\_build\_queue) | Set options to attach (optional) a dead letter queue to the build queue, the queue between the webhook and the scale up lambda. You have the following options. 1. Disable by setting `enabled` to `false`. 2. Enable by setting `enabled` to `true` and `maxReceiveCount` to the maximum number of retries. |
object({
enabled = bool
maxReceiveCount = number
})
|
{
"enabled": false,
"maxReceiveCount": null
}
| no | | [repository\_white\_list](#input\_repository\_white\_list) | List of repositories allowed to use the github app | `list(string)` | `[]` | no | | [role\_path](#input\_role\_path) | The path that will be added to role path for created roles, if not set the environment name will be used. | `string` | `null` | no | | [role\_permissions\_boundary](#input\_role\_permissions\_boundary) | Permissions boundary that will be added to the created roles. | `string` | `null` | no | @@ -455,7 +454,7 @@ In case the setup does not work as intended follow the trace of events: | Name | Description | |------|-------------| | [binaries\_syncer](#output\_binaries\_syncer) | n/a | -| [name](#output\_name) | n/a | +| [queues](#output\_queues) | SQS queues. | | [runners](#output\_runners) | n/a | | [ssm\_parameters](#output\_ssm\_parameters) | n/a | | [webhook](#output\_webhook) | n/a | diff --git a/examples/ephemeral/README.md b/examples/ephemeral/README.md index b2b177bc9f..573cd1ffa8 100644 --- a/examples/ephemeral/README.md +++ b/examples/ephemeral/README.md @@ -1,7 +1,6 @@ # Action runners deployment ephemeral example -This module shows how to create GitHub action runners. Lambda release will be downloaded from GitHub. - +This example is essential based on the default setup. But shows how you can use runners with the ephemeral flag enabled. Once you enable ephemeral a runner will used once, only for one job. This requires that each job needs a fresh instance. To feature should be used in combination with event `workflow_job`, see your GitHub webhook endpoint configuration. We also suggest to use the feature with a pre-build AMI, to optimize the creation time of a runner. ## Usages Steps for the full setup, such as creating a GitHub app can be found in the root module's [README](../../README.md). First download the Lambda releases from GitHub. Alternatively you can build the lambdas locally with Node or Docker, there is a simple build script in `/.ci/build.sh`. In the `main.tf` you can simply remove the location of the lambda zip files, the default location will work in this case. diff --git a/main.tf b/main.tf index c2d5cab8e9..4d9f60dc97 100644 --- a/main.tf +++ b/main.tf @@ -18,10 +18,6 @@ resource "random_string" "random" { upper = false } -locals { - build_queue_dead_letter_arn = var.redrive_build_queue.enabled && var.redrive_build_queue.deadLetterTargetArn == null ? aws_sqs_queue.queued_builds_dlq[0].arn : var.redrive_build_queue.deadLetterTargetArn - redrive_policy = var.redrive_build_queue.enabled ? { deadLetterTargetArn = local.build_queue_dead_letter_arn, maxReceiveCount = var.redrive_build_queue.maxReceiveCount } : null -} resource "aws_sqs_queue" "queued_builds" { name = "${var.environment}-queued-builds${var.fifo_build_queue ? ".fifo" : ""}" delay_seconds = var.delay_webhook_event @@ -30,13 +26,16 @@ resource "aws_sqs_queue" "queued_builds" { fifo_queue = var.fifo_build_queue receive_wait_time_seconds = 0 content_based_deduplication = var.fifo_build_queue - redrive_policy = local.redrive_policy == null ? null : jsonencode(local.redrive_policy) + redrive_policy = var.redrive_build_queue.enabled ? jsonencode({ + deadLetterTargetArn = aws_sqs_queue.queued_builds_dlq[0].arn, + maxReceiveCount = var.redrive_build_queue.maxReceiveCount + }) : null tags = var.tags } resource "aws_sqs_queue" "queued_builds_dlq" { - count = var.redrive_build_queue.enabled && var.redrive_build_queue.deadLetterTargetArn == null ? 1 : 0 + count = var.redrive_build_queue.enabled ? 
1 : 0 name = "${var.environment}-queued-builds_dead_letter" tags = var.tags diff --git a/outputs.tf b/outputs.tf index f4d4408ae0..8dcc418b77 100644 --- a/outputs.tf +++ b/outputs.tf @@ -34,6 +34,10 @@ output "ssm_parameters" { } -output "name" { - value = aws_sqs_queue.queued_builds_dlq.* +output "queues" { + description = "SQS queues." + value = { + build_queue_arn = aws_sqs_queue.queued_builds.arn + build_queue_dlq_arn = var.redrive_build_queue.enabled ? aws_sqs_queue.queued_builds_dlq[0].arn : null + } } diff --git a/variables.tf b/variables.tf index 3f615ad37f..2dbb2b80d7 100644 --- a/variables.tf +++ b/variables.tf @@ -486,16 +486,14 @@ variable "fifo_build_queue" { } variable "redrive_build_queue" { - description = "Set options to attach (optional) a dead letter queue to the build queue, the queue between the webhook and the scale up lambda. You have the following options. 1. Disable by setting, `enalbed' to false. 2. Enable by setting `enabled` to `true`, `maxReceiveCount` to a number of max retries, and `deadLetterTargetArn` to null for letting the module create a queue. Or otherwise provide you own queue by setting an ARN." + description = "Set options to attach (optional) a dead letter queue to the build queue, the queue between the webhook and the scale up lambda. You have the following options. 1. Disable by setting `enabled` to `false`. 2. Enable by setting `enabled` to `true` and `maxReceiveCount` to the maximum number of retries." type = object({ - enabled = bool - maxReceiveCount = number - deadLetterTargetArn = string + enabled = bool + maxReceiveCount = number }) default = { - enabled = false - maxReceiveCount = null - deadLetterTargetArn = null + enabled = false + maxReceiveCount = null } validation { condition = var.redrive_build_queue.enabled && var.redrive_build_queue.maxReceiveCount != null || !var.redrive_build_queue.enabled From ec7af4c64dc950f1f54f88a79ca11170b62ed1b7 Mon Sep 17 00:00:00 2001 From: Niek Palm Date: Tue, 14 Dec 2021 16:23:58 +0100 Subject: [PATCH 27/31] Update examples/ephemeral/README.md Co-authored-by: Nathaniel McAuliffe --- examples/ephemeral/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/ephemeral/README.md b/examples/ephemeral/README.md index 573cd1ffa8..0eec98561d 100644 --- a/examples/ephemeral/README.md +++ b/examples/ephemeral/README.md @@ -1,6 +1,6 @@ # Action runners deployment ephemeral example -This example is essential based on the default setup. But shows how you can use runners with the ephemeral flag enabled. Once you enable ephemeral a runner will used once, only for one job. This requires that each job needs a fresh instance. To feature should be used in combination with event `workflow_job`, see your GitHub webhook endpoint configuration. We also suggest to use the feature with a pre-build AMI, to optimize the creation time of a runner. +This example is based on the default setup, but shows how runners can be used with the ephemeral flag enabled. Once enabled, ephemeral runners will be used for one job only. Each job requires a fresh instance. This feature should be used in combination with the `workflow_job` event. See GitHub webhook endpoint configuration (link needed here). It is also suggested to use a pre-built AMI to minimize runner launch times. ## Usages Steps for the full setup, such as creating a GitHub app can be found in the root module's [README](../../README.md). First download the Lambda releases from GitHub.
Alternatively you can build the lambdas locally with Node or Docker, there is a simple build script in `/.ci/build.sh`. In the `main.tf` you can simply remove the location of the lambda zip files, the default location will work in this case. From e08bd3664d567352be14f5a16161c9b7f02b608f Mon Sep 17 00:00:00 2001 From: Niek Palm Date: Tue, 14 Dec 2021 23:16:44 +0100 Subject: [PATCH 28/31] Process review comments --- .../lambdas/runners/src/lambda.test.ts | 1 + modules/runners/lambdas/runners/src/lambda.ts | 40 +++++++++---------- .../runners/src/scale-runners/scale-up.ts | 10 ++++- 3 files changed, 28 insertions(+), 23 deletions(-) diff --git a/modules/runners/lambdas/runners/src/lambda.test.ts b/modules/runners/lambdas/runners/src/lambda.test.ts index f77cfac0b1..43f5ffdff3 100644 --- a/modules/runners/lambdas/runners/src/lambda.test.ts +++ b/modules/runners/lambdas/runners/src/lambda.test.ts @@ -112,6 +112,7 @@ async function testInvalidRecords(sqsRecords: SQSRecord[]) { expect(logWarnSpy).toHaveBeenCalledWith( 'Event ignored, only one record at the time can be handled, ensure the lambda batch size is set to 1.', + undefined, ); } diff --git a/modules/runners/lambdas/runners/src/lambda.ts b/modules/runners/lambdas/runners/src/lambda.ts index 183f0ac502..20e1c40135 100644 --- a/modules/runners/lambdas/runners/src/lambda.ts +++ b/modules/runners/lambdas/runners/src/lambda.ts @@ -1,7 +1,7 @@ import { scaleUp } from './scale-runners/scale-up'; import { scaleDown } from './scale-runners/scale-down'; import { SQSEvent, ScheduledEvent, Context, Callback } from 'aws-lambda'; -import { logger } from './scale-runners/logger'; +import { LogFields, logger } from './scale-runners/logger'; import ScaleError from './scale-runners/ScaleError'; import 'source-map-support/register'; @@ -9,32 +9,30 @@ export async function scaleUpHandler(event: SQSEvent, context: Context): Promise logger.setSettings({ requestId: context.awsRequestId }); logger.debug(JSON.stringify(event)); if (event.Records.length !== 1) { - logger.warn('Event ignored, only one record at the time can be handled, ensure the lambda batch size is set to 1.'); + logger.warn( + 'Event ignored, only one record at the time can be handled, ensure the lambda batch size is set to 1.', + LogFields.print(), + ); return new Promise((resolve) => resolve()); } - return new Promise((resolve, reject) => { - scaleUp(event.Records[0].eventSource, JSON.parse(event.Records[0].body)) - .then(() => resolve()) - .catch((e: Error) => { - if (e instanceof ScaleError) { - reject(e); - } else { - logger.warn(`Ignoring error: ${e.message}`); - } - }); - }); + try { + await scaleUp(event.Records[0].eventSource, JSON.parse(event.Records[0].body)); + } catch (e) { + if (e instanceof ScaleError) { + throw e; + } else { + logger.warn(`Ignoring error: ${(e as Error).message}`, LogFields.print()); + } + } } export async function scaleDownHandler(event: ScheduledEvent, context: Context): Promise { logger.setSettings({ requestId: context.awsRequestId }); - return new Promise((resolve) => { - scaleDown() - .then(() => resolve()) - .catch((e) => { - logger.error(e); - resolve(); - }); - }); + try { + await scaleDown(); + } catch (e) { + logger.error(e); + } } diff --git a/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts b/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts index 6eca9b43b2..8422c09540 100644 --- a/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts +++ b/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts @@ -16,7 +16,10 
@@ export interface ActionRequestMessage { } export async function scaleUp(eventSource: string, payload: ActionRequestMessage): Promise { - logger.info(`Received ${payload.eventType} from ${payload.repositoryOwner}/${payload.repositoryName}`); + logger.info( + `Received ${payload.eventType} from ${payload.repositoryOwner}/${payload.repositoryName}`, + LogFields.print(), + ); if (eventSource !== 'aws:sqs') throw Error('Cannot handle non-SQS events!'); const enableOrgLevel = yn(process.env.ENABLE_ORGANIZATION_RUNNERS, { default: true }); @@ -28,7 +31,10 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage const ephemeralEnabled = yn(process.env.ENABLE_EPHEMERAL_RUNNERS, { default: false }); if (ephemeralEnabled && payload.eventType !== 'workflow_job') { - logger.warn(`${payload.eventType} event is not supported in combination with ephemeral runners.`); + logger.warn( + `${payload.eventType} event is not supported in combination with ephemeral runners.`, + LogFields.print(), + ); throw Error( `The event type ${payload.eventType} is not supported in combination with ephemeral runners.` + `Please ensure you have enabled workflow_job events.`, From 7c9f512ee2f59169f675b78836618ebc79113a55 Mon Sep 17 00:00:00 2001 From: Niek Palm Date: Wed, 22 Dec 2021 09:22:40 +0100 Subject: [PATCH 29/31] Add docs --- README.md | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 73747159cf..3c0cb7c6a4 100644 --- a/README.md +++ b/README.md @@ -4,9 +4,16 @@ This [Terraform](https://www.terraform.io/) module creates the required infrastructure needed to host [GitHub Actions](https://github.com/features/actions) self hosted, auto scaling runners on [AWS spot instances](https://aws.amazon.com/ec2/spot/). It provides the required logic to handle the life cycle for scaling up and down using a set of AWS Lambda functions. Runners are scaled down to zero to avoid costs when no workflows are active. +> NEW: Ephemeral runners available as beta feature. + +> NEW: Windows runners are available. + +> NEW: Examples for custom AMI are available. + - [Motivation](#motivation) - [Overview](#overview) - - [ARM64 support via Graviton/Graviton2 instance-types](#arm64-support-via-gravitongraviton2-instance-types) + - [Major configuration options.](#major-configuration-options) + - [ARM64 support via Graviton/Graviton2 instance-types](#arm64-support-via-gravitongraviton2-instance-types) - [Usages](#usages) - [Setup GitHub App (part 1)](#setup-github-app-part-1) - [Setup terraform module](#setup-terraform-module) @@ -16,6 +23,7 @@ This [Terraform](https://www.terraform.io/) module creates the required infrastr - [Install app](#install-app) - [Encryption](#encryption) - [Idle runners](#idle-runners) + - [Ephemeral runners](#ephemeral-runners) - [Prebuilt Images](#prebuilt-images) - [Examples](#examples) - [Sub modules](#sub-modules) @@ -48,7 +56,6 @@ For receiving the `check_run` or `workflow_job` event by the webhook (lambda) a - `check_run`: create a webhook on enterprise, org, repo or app level. When using the app option, the app needs to be installed to repo's are using the self-hosted runners. - a Webhook needs to be created. The webhook hook can be defined on enterprise, org, repo, or app level. - In AWS a [API gateway](https://docs.aws.amazon.com/apigateway/index.html) endpoint is created that is able to receive the GitHub webhook events via HTTP post. 
The gateway triggers the webhook lambda which will verify the signature of the event. This check guarantees the event is sent by the GitHub App. The lambda only handles `workflow_job` or `check_run` events with status `queued` and matching the runner labels (only for `workflow_job`). The accepted events are posted on a SQS queue. Messages on this queue will be delayed for a configurable amount of seconds (default 30 seconds) to give the available runners time to pick up this build. The "scale up runner" lambda is listening to the SQS queue and picks up events. The lambda runs various checks to decide whether a new EC2 spot instance needs to be created. For example, the instance is not created if the build is already started by an existing runner, or the maximum number of runners is reached. @@ -71,7 +78,18 @@ Permission are managed on several places. Below the most important ones. For det Besides these permissions, the lambdas also need permission to CloudWatch (for logging and scheduling), SSM and S3. For more details about the required permissions see the [documentation](./modules/setup-iam-permissions/README.md) of the IAM module which uses permission boundaries. -### ARM64 support via Graviton/Graviton2 instance-types +### Major configuration options. + +To support a wide range of use cases the module has quite a lot of configuration options. We try to choose reasonable defaults. The examples also show how to configure the runners for the main use cases. + +- Org vs Repo level. You can configure the module to connect the runners in GitHub on an org level and share the runners in your org, or set the runners on repo level. The module will install the runner to the repo. This can be multiple repos, but runners are not shared between repos. +- Checkrun vs Workflow job event. You can configure the webhook in GitHub to send checkrun or workflow job events to the webhook. Workflow job events were introduced by GitHub in September 2021 and are designed to support scalable runners. We advise using the workflow job event when possible; you can set `disable_check_wokflow_job_labels = true` to disable the label check. +- Linux vs Windows. You can configure the OS types linux and win. Linux will be used by default. +- Re-use vs Ephemeral. By default runners are re-used until detected idle; once idle they will be removed from the pool. To improve security we are introducing ephemeral runners. Those runners are only used for one job. Ephemeral runners only work in combination with the workflow job event. We also suggest using a pre-built AMI to improve the start time of jobs. +- GitHub cloud vs GitHub Enterprise Server (GHES). The runners support GitHub cloud as well as GitHub Enterprise Server. For GHES we rely on our community for testing and support; we have no possibility to test GHES ourselves. + + +#### ARM64 support via Graviton/Graviton2 instance-types When using the default example or top-level module, specifying an `instance_type` that matches a Graviton/Graviton 2 (ARM64) architecture (e.g. a1 or any 6th-gen `g` or `gd` type), the sub-modules will be automatically configured to provision with ARM64 AMIs and leverage GitHub's ARM64 action runner. See below for more details. @@ -268,6 +286,16 @@ idle_config = [{ _**Note**_: When using Windows runners it's recommended to keep a few runners warmed up due to the minutes-long cold start time. +### Ephemeral runners + +Currently a beta feature! You can configure runners to be ephemeral; runners will be used only for one job.
The feature should be used in conjunction with listening for the workflow job event. Please consider the following: + +- The scale down lambda is still active, and should only remove orphan instances. But there is no strict check in place. So ensure you configure `minimum_running_time_in_minutes` to a value that is high enough to get your runner booted and connected, to avoid it being terminated before executing a job. +- The messages sent from the webhook lambda to the scale-up lambda are by default delayed by SQS, to give available runners the option to start the job before the decision is made to scale more runners. For ephemeral runners there is no need to wait. Set `delay_webhook_event` to `0` +- Errors related to scaling should be retried via SQS. You can configure `job_queue_retention_in_seconds` and `redrive_build_queue` to tune the behavior. We have no mechanism to avoid events never being processed, which means potentially no runner could be created and the job in GitHub can time out in 6 hours. + +The example for [ephemeral runners](./examples/ephemeral) is based on the [default example](./examples/default). Have a look at the diff to see the major configuration differences; a minimal configuration sketch is also included after the examples overview below. + ### Prebuilt Images This module also allows you to run agents from a prebuilt AMI to gain faster startup times. You can find more information in [the image README.md](/images/README.md) @@ -295,10 +323,11 @@ For time zones please check [TZ database name column](https://en.wikipedia.org/w Examples are located in the [examples](./examples) directory. The following examples are provided: - _[Default](examples/default/README.md)_: The default example of the module -- _[Permissions boundary](examples/permissions-boundary/README.md)_: Example usages of permissions boundaries. - _[Ubuntu](examples/ubuntu/README.md)_: Example usage of creating a runner using Ubuntu AMIs. -- _[Prebuilt Images](examples/prebuilt/README.md)_: Example usages of deploying runners with a custom prebuilt image. - _[Windows](examples/windows/README.md)_: Example usage of creating a runner using Windows as the OS. +- _[Ephemeral](examples/ephemeral/README.md)_: Example usages of ephemeral runners based on the default example. +- _[Prebuilt Images](examples/prebuilt/README.md)_: Example usages of deploying runners with a custom prebuilt image. +- _[Permissions boundary](examples/permissions-boundary/README.md)_: Example usages of permissions boundaries.
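To make the ephemeral runner notes above concrete, here is a minimal configuration sketch. `delay_webhook_event` and `minimum_running_time_in_minutes` are module inputs referenced above; `enable_ephemeral_runners` is assumed to be the Terraform toggle backing the `ENABLE_EPHEMERAL_RUNNERS` setting used by the scale-up lambda. The module source path, the omitted required inputs and the concrete values are illustrative assumptions, not a tested or recommended setup.

```hcl
# Hypothetical sketch of an ephemeral runner configuration (values are illustrative).
module "runners" {
  source = "../../" # local path to this module, as used in the bundled examples; adjust for your setup

  # ... GitHub app, VPC/subnet and other required inputs omitted ...

  # Use each runner instance for a single job only (beta feature).
  enable_ephemeral_runners = true

  # Ephemeral runners never wait for an existing runner to pick up the job,
  # so there is no reason to delay the webhook events.
  delay_webhook_event = 0

  # Give a freshly created instance enough time to boot and register before the
  # scale-down lambda could treat it as an orphan.
  minimum_running_time_in_minutes = 10
}
```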
## Sub modules From 49f15a0bc31792addc25ef9d3dbab738108804a8 Mon Sep 17 00:00:00 2001 From: Niek Palm Date: Wed, 22 Dec 2021 11:22:51 +0100 Subject: [PATCH 30/31] review comments --- modules/runners/policies/instance-ec2.json | 2 +- modules/webhook/lambdas/webhook/src/sqs/index.ts | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/runners/policies/instance-ec2.json b/modules/runners/policies/instance-ec2.json index 1757552dd8..4a5bc578f5 100644 --- a/modules/runners/policies/instance-ec2.json +++ b/modules/runners/policies/instance-ec2.json @@ -12,4 +12,4 @@ } } ] -} \ No newline at end of file +} diff --git a/modules/webhook/lambdas/webhook/src/sqs/index.ts b/modules/webhook/lambdas/webhook/src/sqs/index.ts index 2217718f7d..63a2a240f2 100644 --- a/modules/webhook/lambdas/webhook/src/sqs/index.ts +++ b/modules/webhook/lambdas/webhook/src/sqs/index.ts @@ -1,5 +1,5 @@ import { SQS } from 'aws-sdk'; -import { logger as logger } from '../webhook/logger'; +import { LogFields, logger as logger } from '../webhook/logger'; export interface ActionRequestMessage { id: number; @@ -20,7 +20,7 @@ export const sendActionRequest = async (message: ActionRequestMessage): Promise< MessageBody: JSON.stringify(message), }; - logger.debug(`sending message to SQS: ${JSON.stringify(sqsMessage)}`); + logger.debug(`sending message to SQS: ${JSON.stringify(sqsMessage)}`, LogFields.print()); if (useFifoQueue) { sqsMessage.MessageGroupId = String(message.id); } From 32690199b9190c7c07fe2606821df4b1a24c043c Mon Sep 17 00:00:00 2001 From: Niek Palm Date: Wed, 22 Dec 2021 11:40:17 +0100 Subject: [PATCH 31/31] update docs --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 3c0cb7c6a4..86354d19cd 100644 --- a/README.md +++ b/README.md @@ -291,7 +291,8 @@ _**Note**_: When using Windows runners it's recommended to keep a few runners wa Currently a beta feature! You can configure runners to be ephemeral; runners will be used only for one job. The feature should be used in conjunction with listening for the workflow job event. Please consider the following: - The scale down lambda is still active, and should only remove orphan instances. But there is no strict check in place. So ensure you configure `minimum_running_time_in_minutes` to a value that is high enough to get your runner booted and connected, to avoid it being terminated before executing a job. -- The messages sent from the webhook lambda to the scale-up lambda are by default delayed by SQS, to give available runners the option to start the job before the decision is made to scale more runners. For ephemeral runners there is no need to wait. Set `delay_webhook_event` to `0` +- The messages sent from the webhook lambda to the scale-up lambda are by default delayed by SQS, to give available runners the option to start the job before the decision is made to scale more runners. For ephemeral runners there is no need to wait. Set `delay_webhook_event` to `0`.
The example for [ephemeral runners](./examples/ephemeral) is based on the [default example](./examples/default). Have look on the diff to see the major configuration differences.