diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 75b9d265870a..199743323a2e 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -36,8 +36,9 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. -# available frameworks - ["base", "vllm", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"] -build_frameworks = [] + +# available frameworks - ["base", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"] +build_frameworks = ["pytorch"] # By default we build both training and inference containers. Set true/false values to determine which to build. @@ -46,7 +47,7 @@ build_inference = true # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" -do_build = true +do_build = false [notify] ### Notify on test failures @@ -71,13 +72,19 @@ ec2_benchmark_tests = false ### default. If false, these types of tests will be skipped while other tests will run as usual. ### These tests are run in EC2 test jobs, so ec2_tests must be true if ec2_tests_on_heavy_instances is true. 
### Off by default (set to false) -ec2_tests_on_heavy_instances = false +ec2_tests_on_heavy_instances = true +### Set enable_ipv6 = true to run tests with IPv6-enabled resources +### Off by default (set to false) +enable_ipv6 = true +### Set the VPC name to be used for IPv6 testing +### Empty by default +ipv6_vpc_name = "dlc-ipv6-test-vpc" ### SM specific tests ### On by default -sagemaker_local_tests = true +sagemaker_local_tests = false # run standard sagemaker remote tests from test/sagemaker_tests -sagemaker_remote_tests = true +sagemaker_remote_tests = false # run efa sagemaker tests sagemaker_efa_tests = false # run release_candidate_integration tests @@ -104,7 +111,7 @@ use_scheduler = false ### TRAINING PR JOBS ### # Standard Framework Training -dlc-pr-pytorch-training = "" +dlc-pr-pytorch-training = "pytorch/training/buildspec-2-6-sm.yml" dlc-pr-tensorflow-2-training = "" dlc-pr-autogluon-training = "" @@ -134,7 +141,7 @@ dlc-pr-tensorflow-2-habana-training = "" ### INFERENCE PR JOBS ### # Standard Framework Inference -dlc-pr-pytorch-inference = "" +dlc-pr-pytorch-inference = "pytorch/inference/buildspec-2-6-sm.yml" dlc-pr-tensorflow-2-inference = "" dlc-pr-autogluon-inference = "" diff --git a/pytorch/inference/buildspec-2-4-ec2.yml b/pytorch/inference/buildspec-2-4-ec2.yml index 1118c3110690..d2bad7c45ca5 100644 --- a/pytorch/inference/buildspec-2-4-ec2.yml +++ b/pytorch/inference/buildspec-2-4-ec2.yml @@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch version: &VERSION 2.4.0 short_version: &SHORT_VERSION "2.4" arch_type: x86 -autopatch_build: "True" +# autopatch_build: "True" repository_info: inference_repository: &INFERENCE_REPOSITORY @@ -44,6 +44,7 @@ images: tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] # build_tag_override: "beta:2.4.0-cpu-py311-ubuntu22.04-ec2" + build_tag_override: "True" 
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] target: ec2 context: @@ -61,6 +62,7 @@ images: tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] # build_tag_override: "beta:2.4.0-gpu-py311-cu124-ubuntu22.04-ec2" + build_tag_override: "True" docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] target: ec2 context: diff --git a/pytorch/inference/buildspec-2-4-sm.yml b/pytorch/inference/buildspec-2-4-sm.yml index 8fb0b8260a42..710b0b122d97 100644 --- a/pytorch/inference/buildspec-2-4-sm.yml +++ b/pytorch/inference/buildspec-2-4-sm.yml @@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch version: &VERSION 2.4.0 short_version: &SHORT_VERSION "2.4" arch_type: x86 -autopatch_build: "True" +# autopatch_build: "True" repository_info: inference_repository: &INFERENCE_REPOSITORY @@ -45,6 +45,7 @@ images: tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ] # build_tag_override: "beta:2.4.0-cpu-py311-ubuntu22.04-sagemaker" + build_tag_override: "True" docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] target: sagemaker context: @@ -63,6 +64,7 @@ images: tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] # build_tag_override: "beta:2.4.0-gpu-py311-cu124-ubuntu22.04-sagemaker" + build_tag_override: "True" docker_file: !join [ docker/, *SHORT_VERSION, /, 
*DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] target: sagemaker diff --git a/pytorch/inference/buildspec-2-5-ec2.yml b/pytorch/inference/buildspec-2-5-ec2.yml index a34fb55735f8..8cd21091ec07 100644 --- a/pytorch/inference/buildspec-2-5-ec2.yml +++ b/pytorch/inference/buildspec-2-5-ec2.yml @@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch version: &VERSION 2.5.1 short_version: &SHORT_VERSION "2.5" arch_type: x86 -autopatch_build: "True" +# autopatch_build: "True" repository_info: inference_repository: &INFERENCE_REPOSITORY @@ -44,6 +44,7 @@ images: tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] # build_tag_override: "beta:2.5.0-cpu-py311-ubuntu22.04-ec2" + build_tag_override: "True" docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] target: ec2 context: @@ -61,6 +62,7 @@ images: tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] # build_tag_override: "beta:2.5.0-gpu-py311-cu124-ubuntu22.04-ec2" + build_tag_override: "True" docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] target: ec2 context: diff --git a/pytorch/inference/buildspec-2-5-sm.yml b/pytorch/inference/buildspec-2-5-sm.yml index 0bba3d6138f4..cdf12138a82e 100644 --- a/pytorch/inference/buildspec-2-5-sm.yml +++ b/pytorch/inference/buildspec-2-5-sm.yml @@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch version: &VERSION 2.5.1 short_version: &SHORT_VERSION "2.5" arch_type: x86 -autopatch_build: "True" +# autopatch_build: "True" repository_info: inference_repository: &INFERENCE_REPOSITORY @@ -45,6 +45,7 @@ images: tag: !join [ 
*VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ] # build_tag_override: "beta:2.5.0-cpu-py311-ubuntu22.04-sagemaker" + build_tag_override: "True" docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] target: sagemaker context: @@ -63,6 +64,7 @@ images: tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] # build_tag_override: "beta:2.5.0-gpu-py311-cu124-ubuntu22.04-sagemaker" + build_tag_override: "True" docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] target: sagemaker diff --git a/pytorch/inference/buildspec-2-6-ec2.yml b/pytorch/inference/buildspec-2-6-ec2.yml index dc825393378d..07e180e2e678 100644 --- a/pytorch/inference/buildspec-2-6-ec2.yml +++ b/pytorch/inference/buildspec-2-6-ec2.yml @@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch version: &VERSION 2.6.0 short_version: &SHORT_VERSION "2.6" arch_type: x86 -autopatch_build: "True" +# autopatch_build: "True" repository_info: inference_repository: &INFERENCE_REPOSITORY @@ -44,6 +44,7 @@ images: tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] # build_tag_override: "False" + build_tag_override: "True" docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] target: ec2 context: @@ -61,6 +62,7 @@ images: tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] latest_release_tag: !join [ 
*VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] # build_tag_override: "False" + build_tag_override: "True" docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] target: ec2 context: diff --git a/pytorch/inference/buildspec-2-6-sm.yml b/pytorch/inference/buildspec-2-6-sm.yml index 3b1135a26856..1ba01ab613db 100644 --- a/pytorch/inference/buildspec-2-6-sm.yml +++ b/pytorch/inference/buildspec-2-6-sm.yml @@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch version: &VERSION 2.6.0 short_version: &SHORT_VERSION "2.6" arch_type: x86 -autopatch_build: "True" +# autopatch_build: "True" repository_info: inference_repository: &INFERENCE_REPOSITORY @@ -48,6 +48,7 @@ images: tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ] # build_tag_override: "beta:2.5.0-cpu-py311-ubuntu22.04-sagemaker" + build_tag_override: "True" docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] target: sagemaker context: @@ -66,6 +67,7 @@ images: tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] # build_tag_override: "beta:2.5.0-gpu-py311-cu124-ubuntu22.04-sagemaker" + build_tag_override: "True" docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] target: sagemaker diff --git a/pytorch/training/buildspec-2-4-ec2.yml b/pytorch/training/buildspec-2-4-ec2.yml index 88bbcb846f8f..9200fdb56c94 100644 --- a/pytorch/training/buildspec-2-4-ec2.yml +++ b/pytorch/training/buildspec-2-4-ec2.yml @@ -5,7 +5,7 @@ framework: 
&FRAMEWORK pytorch version: &VERSION 2.4.0 short_version: &SHORT_VERSION "2.4" arch_type: x86 -autopatch_build: "True" +# autopatch_build: "True" repository_info: training_repository: &TRAINING_REPOSITORY @@ -42,7 +42,7 @@ images: os_version: &OS_VERSION ubuntu22.04 tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] - # build_tag_override: "True" + build_tag_override: "True" docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] target: ec2 context: @@ -58,7 +58,7 @@ images: os_version: &OS_VERSION ubuntu22.04 tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] - # build_tag_override: "True" + build_tag_override: "True" docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] target: ec2 diff --git a/pytorch/training/buildspec-2-4-sm.yml b/pytorch/training/buildspec-2-4-sm.yml index 994115f4029c..e5c9826b7385 100644 --- a/pytorch/training/buildspec-2-4-sm.yml +++ b/pytorch/training/buildspec-2-4-sm.yml @@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch version: &VERSION 2.4.0 short_version: &SHORT_VERSION "2.4" arch_type: x86 -autopatch_build: "True" +# autopatch_build: "True" repository_info: training_repository: &TRAINING_REPOSITORY @@ -43,6 +43,7 @@ images: tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ] # build_tag_override: "beta:2.4.0-cpu-py311-ubuntu22.04-sagemaker" + build_tag_override: "True" docker_file: !join [ docker/, *SHORT_VERSION, /, 
*DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] target: sagemaker context: @@ -59,6 +60,7 @@ images: tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] # build_tag_override: "beta:2.4.0-gpu-py311-cu124-ubuntu22.04-sagemaker" + build_tag_override: "True" docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] target: sagemaker diff --git a/pytorch/training/buildspec-2-5-ec2.yml b/pytorch/training/buildspec-2-5-ec2.yml index 39438d1ed034..742daab8aa03 100644 --- a/pytorch/training/buildspec-2-5-ec2.yml +++ b/pytorch/training/buildspec-2-5-ec2.yml @@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch version: &VERSION 2.5.1 short_version: &SHORT_VERSION "2.5" arch_type: x86 -autopatch_build: "True" +# autopatch_build: "True" repository_info: training_repository: &TRAINING_REPOSITORY @@ -42,7 +42,7 @@ images: os_version: &OS_VERSION ubuntu22.04 tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] - # build_tag_override: "true" + build_tag_override: "True" docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] target: ec2 context: @@ -58,7 +58,7 @@ images: os_version: &OS_VERSION ubuntu22.04 tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] - # build_tag_override: "true" + build_tag_override: "True" docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] 
target: ec2 diff --git a/pytorch/training/buildspec-2-5-sm.yml b/pytorch/training/buildspec-2-5-sm.yml index e3b305db5738..5f32b6835ba9 100644 --- a/pytorch/training/buildspec-2-5-sm.yml +++ b/pytorch/training/buildspec-2-5-sm.yml @@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch version: &VERSION 2.5.1 short_version: &SHORT_VERSION "2.5" arch_type: x86 -autopatch_build: "True" +# autopatch_build: "True" repository_info: training_repository: &TRAINING_REPOSITORY @@ -49,6 +49,7 @@ images: tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ] # build_tag_override: "beta:2.5.1-cpu-py311-ubuntu22.04-sagemaker" + build_tag_override: "True" docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] target: sagemaker context: @@ -65,6 +66,7 @@ images: tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] # build_tag_override: "beta:2.5.1-gpu-py311-cu124-ubuntu22.04-sagemaker" + build_tag_override: "True" docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] target: sagemaker diff --git a/pytorch/training/buildspec-2-6-ec2.yml b/pytorch/training/buildspec-2-6-ec2.yml index 6e55dba3a167..402c1879ca0c 100644 --- a/pytorch/training/buildspec-2-6-ec2.yml +++ b/pytorch/training/buildspec-2-6-ec2.yml @@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch version: &VERSION 2.6.0 short_version: &SHORT_VERSION "2.6" arch_type: x86 -autopatch_build: "True" +# autopatch_build: "True" repository_info: training_repository: &TRAINING_REPOSITORY @@ -48,7 +48,7 @@ images: os_version: &OS_VERSION ubuntu22.04 tag: !join [ *VERSION, "-", 
*DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] - # build_tag_override: "True" + build_tag_override: "True" docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] target: ec2 context: @@ -64,7 +64,7 @@ images: os_version: &OS_VERSION ubuntu22.04 tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] - # build_tag_override: "True" + build_tag_override: "True" docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] target: ec2 diff --git a/pytorch/training/buildspec-2-6-sm.yml b/pytorch/training/buildspec-2-6-sm.yml index 2e2199ec48be..cc7477d83157 100644 --- a/pytorch/training/buildspec-2-6-sm.yml +++ b/pytorch/training/buildspec-2-6-sm.yml @@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch version: &VERSION 2.6.0 short_version: &SHORT_VERSION "2.6" arch_type: x86 -autopatch_build: "True" +# autopatch_build: "True" repository_info: training_repository: &TRAINING_REPOSITORY @@ -48,7 +48,7 @@ images: os_version: &OS_VERSION ubuntu22.04 tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ] - # build_tag_override: "beta:2.6.0-cpu-py311-ubuntu22.04-sagemaker" + build_tag_override: "True" docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] target: sagemaker context: @@ -64,7 +64,7 @@ images: os_version: &OS_VERSION ubuntu22.04 tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] 
latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] - # build_tag_override: "beta:2.6.0-gpu-py311-cu124-ubuntu22.04-sagemaker" + build_tag_override: "True" docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] target: sagemaker diff --git a/src/config.py b/src/config.py index a485b402f14b..af84ba81f309 100644 --- a/src/config.py +++ b/src/config.py @@ -75,6 +75,10 @@ def are_heavy_instance_ec2_tests_enabled(): return parse_dlc_developer_configs("test", "ec2_tests_on_heavy_instances") +def is_ipv6_test_enabled(): + return parse_dlc_developer_configs("test", "enable_ipv6") + + def is_ecs_test_enabled(): return parse_dlc_developer_configs("test", "ecs_tests") @@ -158,6 +162,13 @@ def get_notification_severity(): return AllowedNotificationSeverity.MEDIUM.value +def get_ipv6_vpc_name(): + """ + Get the config value for ipv6_vpc_name + """ + return parse_dlc_developer_configs("test", "ipv6_vpc_name") + + def get_sagemaker_remote_efa_instance_type(): """ Get the config value for sagemaker_remote_efa_instance_type diff --git a/src/start_testbuilds.py b/src/start_testbuilds.py index c1540d8e8541..c368756aa03c 100644 --- a/src/start_testbuilds.py +++ b/src/start_testbuilds.py @@ -51,6 +51,9 @@ def run_test_job(commit, codebuild_project, images_str=""): config.are_heavy_instance_ec2_tests_enabled() and "ec2" in codebuild_project ) + # For EC2 tests, enable IPv6 testing when config is enabled + is_ipv6_test_enabled = config.is_ipv6_test_enabled() and "ec2" in codebuild_project + if config.is_deep_canary_mode_enabled(): env_overrides.append({"name": "DEEP_CANARY_MODE", "value": "true", "type": "PLAINTEXT"}) @@ -85,11 +88,21 @@ def run_test_job(commit, codebuild_project, images_str=""): "value": config.get_sagemaker_remote_efa_instance_type(), "type": "PLAINTEXT", }, + { + "name": "IPV6_VPC_NAME", + "value": 
config.get_ipv6_vpc_name(), + "type": "PLAINTEXT", + }, { "name": "HEAVY_INSTANCE_EC2_TESTS_ENABLED", "value": str(are_heavy_instance_ec2_tests_enabled), "type": "PLAINTEXT", }, + { + "name": "ENABLE_IPV6_TESTING", + "value": str(is_ipv6_test_enabled), + "type": "PLAINTEXT", + }, { "name": "FRAMEWORK_BUILDSPEC_FILE", "value": config.get_buildspec_override() or os.getenv("FRAMEWORK_BUILDSPEC_FILE"), diff --git a/tensorflow/inference/buildspec-2-18-ec2.yml b/tensorflow/inference/buildspec-2-18-ec2.yml index 6607c4c3b650..da74a4e69faa 100644 --- a/tensorflow/inference/buildspec-2-18-ec2.yml +++ b/tensorflow/inference/buildspec-2-18-ec2.yml @@ -5,7 +5,7 @@ framework: &FRAMEWORK tensorflow version: &VERSION 2.18.0 short_version: &SHORT_VERSION 2.18 arch_type: x86 -autopatch_build: "True" +# autopatch_build: "True" repository_info: inference_repository: &INFERENCE_REPOSITORY @@ -47,7 +47,7 @@ images: "-ec2" ] docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] target: ec2 - # build_tag_override: "true" + build_tag_override: "True" context: <<: *INFERENCE_CONTEXT BuildEC2TensorflowGPUInferencePy3DockerImage: @@ -67,6 +67,6 @@ images: docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] target: ec2 - # build_tag_override: "true" + build_tag_override: "True" context: <<: *INFERENCE_CONTEXT diff --git a/tensorflow/training/buildspec-2-18-ec2.yml b/tensorflow/training/buildspec-2-18-ec2.yml index 5892c9471fda..3fbc94b5c252 100644 --- a/tensorflow/training/buildspec-2-18-ec2.yml +++ b/tensorflow/training/buildspec-2-18-ec2.yml @@ -5,7 +5,7 @@ framework: &FRAMEWORK tensorflow version: &VERSION 2.18.0 short_version: &SHORT_VERSION "2.18" arch_type: x86 -autopatch_build: "True" +# autopatch_build: "True" repository_info: training_repository: &TRAINING_REPOSITORY @@ -41,7 +41,7 @@ images: tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", 
*OS_VERSION, "-ec2" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] - # build_tag_override: "True" + build_tag_override: "True" docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] target: ec2 enable_test_promotion: true @@ -59,7 +59,7 @@ images: tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] - # build_tag_override: "True" + build_tag_override: "True" docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] target: ec2 diff --git a/test/dlc_tests/sanity/quick_checks/test_dlc_developer_config.py b/test/dlc_tests/sanity/quick_checks/test_dlc_developer_config.py index a0e374e6739b..8d8cb0c199e0 100644 --- a/test/dlc_tests/sanity/quick_checks/test_dlc_developer_config.py +++ b/test/dlc_tests/sanity/quick_checks/test_dlc_developer_config.py @@ -44,6 +44,8 @@ def test_developer_configuration(): assert config.parse_dlc_developer_configs("test", "ec2_benchmark_tests") is False assert config.parse_dlc_developer_configs("test", "sagemaker_benchmark_tests") is False assert config.parse_dlc_developer_configs("test", "ec2_tests_on_heavy_instances") is False + assert config.parse_dlc_developer_configs("test", "enable_ipv6") is False + assert config.parse_dlc_developer_configs("test", "ipv6_vpc_name") == "" assert config.parse_dlc_developer_configs("test", "nightly_pr_test_mode") is False assert config.parse_dlc_developer_configs("test", "use_scheduler") is False assert config.parse_dlc_developer_configs("test", "safety_check_test") is False @@ -75,6 +77,8 @@ def test_developer_config_wrappers_defaults(): assert config.is_ec2_benchmark_test_enabled() is False assert config.is_sm_benchmark_test_enabled() is False 
assert config.are_heavy_instance_ec2_tests_enabled() is False + assert config.is_ipv6_test_enabled() is False + assert config.get_ipv6_vpc_name() == "" assert config.is_nightly_pr_test_mode_enabled() is False assert config.is_scheduler_enabled() is False assert config.is_safety_check_test_enabled() is False diff --git a/test/test_utils/ec2.py b/test/test_utils/ec2.py index 5f3a825e74be..01199580f06b 100644 --- a/test/test_utils/ec2.py +++ b/test/test_utils/ec2.py @@ -3,6 +3,7 @@ import re import logging import sys +import socket import uuid import copy @@ -48,6 +49,11 @@ # List of instance types which are too powerful for minor tests HEAVY_INSTANCE_LIST = ["p4d.24xlarge", "p4de.24xlarge", "p5.48xlarge"] +# Flag to enable IPv6 testing +ENABLE_IPV6_TESTING = os.getenv("ENABLE_IPV6_TESTING", "false").lower() == "true" + +IPV6_VPC_NAME = os.getenv("IPV6_VPC_NAME") + LOGGER = logging.getLogger(__name__) LOGGER.addHandler(logging.StreamHandler(sys.stdout)) @@ -285,6 +291,14 @@ def launch_instance( while reservations: reservation = reservations.pop(0) + if ENABLE_IPV6_TESTING: + ipv6_network_interfaces = try_get_ipv6_network_interface(client, reservation["AvailabilityZone"]) + if not ipv6_network_interfaces: + LOGGER.info(f"Skipping reservation in AZ {reservation['AvailabilityZone']} - no IPv6 subnet available") + continue + else: + arguments_dict["NetworkInterfaces"] = ipv6_network_interfaces + arguments_dict["CapacityReservationSpecification"] = { "CapacityReservationTarget": { "CapacityReservationId": reservation["CapacityReservationId"] @@ -311,6 +325,17 @@ def launch_instance( # Clean up cap reservation if we don't find one arguments_dict.pop("CapacityReservationSpecification", None) LOGGER.info(f"No capacity reservation available for {instance_type}, trying elsewhere...") + + if ENABLE_IPV6_TESTING: + all_azs = get_availability_zone_ids(client) + for az in all_azs: + ipv6_network_interfaces = try_get_ipv6_network_interface(client, az) + if ipv6_network_interfaces: + 
arguments_dict["NetworkInterfaces"] = ipv6_network_interfaces + break + else: + raise Exception("No AZs available with IPv6 subnets") + response = client.run_instances(**arguments_dict) if not response or len(response["Instances"]) < 1: @@ -319,6 +344,14 @@ def launch_instance( Did not return any response" ) + # quick VPC verification + if ENABLE_IPV6_TESTING: + instance = response["Instances"][0] + actual_vpc_id = instance["VpcId"] + expected_vpc_name = os.getenv("IPV6_VPC_NAME") + expected_vpc_id = get_vpc_id_by_name(client, expected_vpc_name) + LOGGER.info(f"[launch_instance] Standard instance {instance['InstanceId']} VPC check - Expected: {expected_vpc_id} ({expected_vpc_name}), Got: {actual_vpc_id}") + return response["Instances"][0] @@ -351,6 +384,61 @@ def get_available_reservations(ec2_client, instance_type, min_availability=1): return sorted(open_tables, key=lambda res: res["AvailableInstanceCount"]) +def generate_standard_ipv6_network_interface(ec2_client, availability_zone): + """ + Generate network interface configuration for IPv6-enabled instances. 
+ :param ec2_client: boto3 EC2 Client + :param availability_zone: str AZ in which the instance must be created + :return: list containing a single network interface configuration for IPv6 + """ + if not IPV6_VPC_NAME: + LOGGER.error("[generate_standard_ipv6_network_interface] IPv6 VPC name is not set") + raise ValueError("IPv6 VPC name is not set") + + LOGGER.info(f"[generate_standard_ipv6_network_interface] configuring interface for IPv6 VPC: {IPV6_VPC_NAME} in AZ: {availability_zone}") + + ipv6_default_sg = get_default_security_group_id_by_vpc_id(ec2_client, IPV6_VPC_NAME) + LOGGER.info(f"[generate_standard_ipv6_network_interface] got IPv6 default security group: {ipv6_default_sg}") + + ipv6_subnet_id = get_ipv6_enabled_subnet_for_az( + ec2_client, IPV6_VPC_NAME, availability_zone + ) + LOGGER.info(f"[generate_standard_ipv6_network_interface] got IPv6 subnet ID: {ipv6_subnet_id}") + + network_interfaces = [{ + "DeviceIndex": 0, + "DeleteOnTermination": True, + "Groups": [ipv6_default_sg], + "SubnetId": ipv6_subnet_id, + "Ipv6AddressCount": 1, + "AssociatePublicIpAddress": False + }] + + LOGGER.info(f"[generate_standard_ipv6_network_interface] generated network interface config: {network_interfaces}") + return network_interfaces + + +def try_get_ipv6_network_interface(ec2_client, az): + """ + Try to get network interface configuration for IPv6 in specified AZ + :param ec2_client: boto3 EC2 Client object + :param az: string - AZ to check for IPv6 subnet + :return: list containing network interface configuration if IPv6 subnet found in AZ, + None if no IPv6 subnet available + """ + try: + LOGGER.info(f"[try_get_ipv6_network_interface] Found IPv6-enabled subnet in AZ {az}") + network_interfaces = generate_standard_ipv6_network_interface(ec2_client, az) + if network_interfaces: + LOGGER.info(f"[try_get_ipv6_network_interface] Successfully generated IPv6 network interfaces for AZ {az}: {network_interfaces}") + else: + LOGGER.info(f"[try_get_ipv6_network_interface]No 
IPv6 network interfaces generated for AZ {az}") + return network_interfaces + except Exception as e: + LOGGER.info(f"[try_get_ipv6_network_interface] No IPv6-enabled subnet available in AZ {az}: {str(e)}") + return None + + @retry( reraise=True, stop=stop_after_delay(30 * 60), # Keep retrying for 30 minutes @@ -380,11 +468,21 @@ def launch_instances_with_retry( # Look at available CRs first while reservations: reservation = reservations.pop(0) + + if ENABLE_IPV6_TESTING: + ipv6_network_interfaces = try_get_ipv6_network_interface(ec2_client, reservation["AvailabilityZone"]) + if not ipv6_network_interfaces: + LOGGER.info(f"Skipping reservation in AZ {reservation['AvailabilityZone']} - no IPv6 subnet available") + continue + else: + ec2_create_instances_definition["NetworkInterfaces"] = ipv6_network_interfaces + ec2_create_instances_definition["CapacityReservationSpecification"] = { "CapacityReservationTarget": { "CapacityReservationId": reservation["CapacityReservationId"] } } + try: instances = ec2_resource.create_instances(**ec2_create_instances_definition) LOGGER.info( @@ -392,6 +490,17 @@ def launch_instances_with_retry( ) if is_mainline_context(): LOGGER.info(f"Launched instance for {fn_name} via {reservation}") + + if instances: + instance_ids = [instance.id for instance in instances] + instance_details = ec2_client.describe_instances(InstanceIds=instance_ids) + for reservation in instance_details['Reservations']: + for instance in reservation['Instances']: + LOGGER.info(f"[launch_instances_with_retry] Launched instance {instance['InstanceId']} in VPC {instance['VpcId']}") + if ENABLE_IPV6_TESTING: + expected_vpc_id = get_vpc_id_by_name(ec2_client, IPV6_VPC_NAME) + LOGGER.info(f"[launch_instances_with_retry] VPC check - expected: {expected_vpc_id} ({IPV6_VPC_NAME}), got: {instance['VpcId']}") + return instances except ClientError as e: LOGGER.error(f"Failed to launch via reservation for {fn_name} - {e}") @@ -406,10 +515,29 @@ def launch_instances_with_retry( 
if availability_zone_options: error = None for a_zone in availability_zone_options: + if ENABLE_IPV6_TESTING: + ipv6_network_interfaces = try_get_ipv6_network_interface(ec2_client, a_zone) + if not ipv6_network_interfaces: + LOGGER.info(f"Skipping AZ {a_zone} - no IPv6 subnet available") + continue + else: + ec2_create_instances_definition["NetworkInterfaces"] = ipv6_network_interfaces + ec2_create_instances_definition["Placement"] = {"AvailabilityZone": a_zone} + try: instances = ec2_resource.create_instances(**ec2_create_instances_definition) if instances: + + instance_ids = [instance.id for instance in instances] + instance_details = ec2_client.describe_instances(InstanceIds=instance_ids) + for reservation in instance_details['Reservations']: + for instance in reservation['Instances']: + LOGGER.info(f"[launch_instances_with_retry] Launched instance {instance['InstanceId']} in VPC {instance['VpcId']}") + if ENABLE_IPV6_TESTING: + expected_vpc_id = get_vpc_id_by_name(ec2_client, IPV6_VPC_NAME) + LOGGER.info(f"[launch_instances_with_retry] VPC check - expected: {expected_vpc_id} ({IPV6_VPC_NAME}), got: {instance['VpcId']}") + break except ClientError as e: LOGGER.error(f"Failed to launch in {a_zone} due to {e} for {fn_name}") @@ -418,7 +546,31 @@ def launch_instances_with_retry( if not instances: raise error else: + if ENABLE_IPV6_TESTING: + all_azs = get_availability_zone_ids(ec2_client) + found_ipv6_az = False + for az in all_azs: + ipv6_network_interfaces = try_get_ipv6_network_interface(ec2_client, az) + if ipv6_network_interfaces: + ec2_create_instances_definition["Placement"] = {"AvailabilityZone": az} + ec2_create_instances_definition["NetworkInterfaces"] = ipv6_network_interfaces + found_ipv6_az = True + break + if not found_ipv6_az: + raise Exception("No AZs available with IPv6 subnets") + instances = ec2_resource.create_instances(**ec2_create_instances_definition) + + if instances: + instance_ids = [instance.id for instance in instances] + 
instance_details = ec2_client.describe_instances(InstanceIds=instance_ids) + for reservation in instance_details['Reservations']: + for instance in reservation['Instances']: + LOGGER.info(f"[launch_instances_with_retry] Launched instance {instance['InstanceId']} in VPC {instance['VpcId']}") + if ENABLE_IPV6_TESTING: + expected_vpc_id = get_vpc_id_by_name(ec2_client, IPV6_VPC_NAME) + LOGGER.info(f"[launch_instances_with_retry] VPC check - expected: {expected_vpc_id} ({IPV6_VPC_NAME}), got: {instance['VpcId']}") + return instances @@ -433,12 +585,24 @@ def launch_efa(ec2_client, ec2_instance_type, ec2_run_instances_definition, avai } ) response = ec2_client.run_instances(**ec2_efa_run_instances_definition) or {} + + LOGGER.info(f"[launch_efa] Launching EFA instance in AZ: {availability_zone}") + + if response.get("Instances"): + instance_id = response["Instances"][0]["InstanceId"] + vpc_id = response["Instances"][0]["VpcId"] + LOGGER.info(f"[launch_efa] Launched EFA instance {instance_id} in VPC: {vpc_id}") + return response.get("Instances") def launch_efa_with_reservations( ec2_client, ec2_instance_type, reservations, ec2_run_instances_definition, fn_name="" ): + + LOGGER.info("[launch_efa_with_reservations] was called") + + ec2_run_instances_reserved_definition = copy.deepcopy(ec2_run_instances_definition) while reservations: reservation = reservations.pop(0) @@ -520,6 +684,9 @@ def launch_efa_with_heterogenous_reservations(ec2_client, ec2_run_instances_defi Returns: list: launched instances """ + + LOGGER.info("[launch_efa_with_heterogenous_reservations] was called") + ec2_heterogenous_run_instances_definition = copy.deepcopy(ec2_run_instances_definition) ec2_instance_type = ec2_heterogenous_run_instances_definition["InstanceType"] minimum_number_of_instances = ec2_heterogenous_run_instances_definition["MinCount"] @@ -657,6 +824,9 @@ def launch_efa_instances_with_retry( :return: dict response from ec2_client.run_instances """ region = ec2_client.meta.region_name + 
+ LOGGER.info("[launch_efa_instances_with_retry] was called") + LOGGER.info(f"Trying to launch {ec2_instance_type} for {fn_name} via capacity reservation...") heterogenous_reservation_launch = launch_efa_with_heterogenous_reservations( @@ -693,8 +863,12 @@ def launch_efa_instances_with_retry( return instances -def get_ec2_client(region): - return boto3.client("ec2", region_name=region, config=Config(retries={"max_attempts": 10})) +def get_ec2_client(region=DEFAULT_REGION): + config = Config(retries={"max_attempts": 10}) + if ENABLE_IPV6_TESTING: + endpoint_url = f"https://ec2.{region}.api.aws" + return boto3.client("ec2", region_name=region, endpoint_url=endpoint_url, config=config) + return boto3.client("ec2", region_name=region, config=config) def get_instance_from_id(instance_id, region=DEFAULT_REGION): @@ -739,6 +913,13 @@ def get_public_ip(instance_id, region=DEFAULT_REGION): :return: IP Address of instance with matching instance ID """ instance = get_instance_from_id(instance_id, region) + + if ENABLE_IPV6_TESTING: + if not instance.get("NetworkInterfaces") or not instance["NetworkInterfaces"][0].get("Ipv6Addresses"): + raise Exception("IPv6 address not yet available") + ipv6_addr = instance["NetworkInterfaces"][0]["Ipv6Addresses"][0]["Ipv6Address"] + return ipv6_addr + if not instance["PublicIpAddress"]: raise Exception("IP address not yet available") return instance["PublicIpAddress"] @@ -1011,7 +1192,6 @@ def get_ec2_fabric_connection(instance_id, instance_pem_file, region): ) return conn - def get_ec2_instance_tags(instance_id, region=DEFAULT_REGION, ec2_client=None): ec2_client = ec2_client or get_ec2_client(region) response = ec2_client.describe_tags(Filters=[{"Name": "resource-id", "Values": [instance_id]}]) @@ -1762,6 +1942,9 @@ def get_default_vpc_id(ec2_client): """ response = ec2_client.describe_vpcs(Filters=[{"Name": "is-default", "Values": ["true"]}]) default_vpc_id = response["Vpcs"][0]["VpcId"] + + LOGGER.info(f"[get_default_vpc_id] got default 
VPC ID: {default_vpc_id}") + return default_vpc_id @@ -1812,6 +1995,111 @@ def get_default_subnet_for_az(ec2_client, availability_zone): return az_subnet_id +def get_vpc_id_by_name(ec2_client, vpc_name): + """ + Get VPC ID by VPC name tag + :param ec2_client: boto3 EC2 Client object + :param vpc_name: Name tag value of the VPC + :return: str VPC ID of the VPC name + """ + response = ec2_client.describe_vpcs(Filters=[{"Name": "tag:Name", "Values": [vpc_name]}]).get( + "Vpcs", [] + ) + + if not response: + raise Exception(f"No VPC found with Name tag: {vpc_name}") + elif len(response) > 1: + raise Exception(f"Multiple VPCs found with Name tag: {vpc_name}") + + vpc_id = response[0]["VpcId"] + + LOGGER.info(f"[get_vpc_id_by_name] got VPC ID: {vpc_id} for VPC name: {vpc_name}") + + return vpc_id + + +def get_default_security_group_id_by_vpc_id(ec2_client, vpc_name): + """ + Get default SG ID for a non-default VPC + :param ec2_client: boto3 EC2 Client object + :param vpc_name: Name tag value of the VPC + :return: str SG ID of the default SG + """ + try: + vpc_id = get_vpc_id_by_name(ec2_client, vpc_name) + + response = ec2_client.describe_security_groups( + Filters=[ + {"Name": "vpc-id", "Values": [vpc_id]}, + {"Name": "group-name", "Values": ["default"]}, + ], + ) + + security_group_id = response["SecurityGroups"][0]["GroupId"] + return security_group_id + except Exception as e: + LOGGER.error(f"Error in get_default_security_group_id_by_vpc_id: {str(e)}") + raise + + +def get_ipv6_efa_enabled_security_group_id(ec2_client, vpc_name): + """ + Get EFA-enabled SG ID for IPv6 VPC + :param ec2_client: boto3 EC2 Client object + :param vpc_name: Name tag value of the VPC + :return: str SG ID of the EFA-enabled SG + """ + try: + vpc_id = get_vpc_id_by_name(ec2_client, vpc_name) + + # get the EFA-enabled SG + response = ec2_client.describe_security_groups( + Filters=[ + {"Name": "vpc-id", "Values": [vpc_id]}, + {"Name": "group-name", "Values": ["EFA-enabled-ipv6"]}, + ], + ) + + 
efa_security_group_id = response["SecurityGroups"][0]["GroupId"]
+        return efa_security_group_id
+    except Exception as e:
+        LOGGER.error(f"Error in get_ipv6_efa_enabled_security_group_id: {str(e)}")
+        raise
+
+
+def get_ipv6_enabled_subnet_for_az(ec2_client, vpc_name, availability_zone):
+    """
+    Get IPv6-enabled subnet ID in a particular availability zone
+    :param ec2_client: boto3 EC2 Client object
+    :param vpc_name: Name tag value of the VPC
+    :param availability_zone: str AZ name
+    :return: str Subnet ID of an IPv6-enabled subnet
+    """
+    try:
+        vpc_id = get_vpc_id_by_name(ec2_client, vpc_name)
+
+        response = ec2_client.describe_subnets(
+            Filters=[
+                {"Name": "vpc-id", "Values": [vpc_id]},
+                {"Name": "availability-zone", "Values": [availability_zone]},
+            ]
+        )
+
+        ipv6_subnets = [
+            subnet for subnet in response["Subnets"] if subnet.get("Ipv6CidrBlockAssociationSet")
+        ]
+
+        if not ipv6_subnets:
+            raise Exception(
+                f"No IPv6-enabled subnet found in AZ {availability_zone} for VPC {vpc_id}"
+            )
+
+        return ipv6_subnets[0]["SubnetId"]
+    except Exception as e:
+        LOGGER.error(f"Error in get_ipv6_enabled_subnet_for_az: {str(e)}")
+        raise
+
+
 def generate_network_interfaces(ec2_client, ec2_instance_type, availability_zone):
     """
     Generate list of EFA-network-interfaces based on the number of network-interfaces available
@@ -1825,21 +2113,39 @@ def generate_network_interfaces(ec2_client, ec2_instance_type, availability_zone
     if not num_efa_interfaces:
         raise AttributeError(f"Unable to get number of EFA Interfaces for {ec2_instance_type}")

-    default_sg = get_default_security_group_id(ec2_client)
-    efa_sg = get_efa_enabled_security_group_id(ec2_client)
-    default_subnet_id = get_default_subnet_for_az(ec2_client, availability_zone)
+    LOGGER.info(f"Generating network interfaces for {ec2_instance_type} in {availability_zone}")

-    network_interfaces = [
-        {
+    if ENABLE_IPV6_TESTING:
+        vpc_name = IPV6_VPC_NAME
+        efa_sg = get_ipv6_efa_enabled_security_group_id(ec2_client, 
vpc_name) + sg_ids = [efa_sg] + subnet_id = get_ipv6_enabled_subnet_for_az(ec2_client, vpc_name, availability_zone) + else: + default_sg = get_default_security_group_id(ec2_client) + efa_sg = get_efa_enabled_security_group_id(ec2_client) + sg_ids = [default_sg, efa_sg] + subnet_id = get_default_subnet_for_az(ec2_client, availability_zone) + + network_interfaces = [] + for i in range(num_efa_interfaces): + interface = { "DeviceIndex": 0 if i == 0 else 1, "NetworkCardIndex": i, "DeleteOnTermination": True, "InterfaceType": "efa", - "Groups": [default_sg, efa_sg], - "SubnetId": default_subnet_id, + "Groups": sg_ids, + "SubnetId": subnet_id } - for i in range(num_efa_interfaces) - ] + + if ENABLE_IPV6_TESTING: + interface["Ipv6AddressCount"] = 1 + interface["AssociatePublicIpAddress"] = False + + network_interfaces.append(interface) + LOGGER.info(f"Created interface {i}: {interface}") + + LOGGER.info(f"[generate_network_interfaces] Created {len(network_interfaces)} interfaces for {ec2_instance_type} in subnet: {subnet_id}") + return network_interfaces @@ -1883,10 +2189,23 @@ def attach_elastic_ip(network_interface_id, region="us-east-1"): def delete_elastic_ips(elastic_ip_allocation_ids, ec2_client): - """Deletes elastic ips created for efa p4d testing""" + """ + Deletes elastic ips created for efa p4d testing. 
+ For default VPC (IPv4): can release directly + For non-default VPC (IPv6): need to disassociate before release + """ for allocation_id in elastic_ip_allocation_ids: - LOGGER.info(f"Deleting elastic ip {allocation_id}") - ec2_client.release_address(AllocationId=allocation_id) + try: + if ENABLE_IPV6_TESTING: + address = ec2_client.describe_addresses(AllocationIds=[allocation_id])["Addresses"][ + 0 + ] + if "AssociationId" in address: + ec2_client.disassociate_address(AssociationId=address["AssociationId"]) + time.sleep(10) + ec2_client.release_address(AllocationId=allocation_id) + except Exception as e: + LOGGER.error(f"Failed to delete elastic ip {allocation_id}: {str(e)}") def create_name_tags_for_instance(instance_id, name_tag, region): diff --git a/test/testrunner.py b/test/testrunner.py index c46a2c439b65..d0f51c9c4c6f 100644 --- a/test/testrunner.py +++ b/test/testrunner.py @@ -290,6 +290,12 @@ def main(): efa_dedicated = is_efa_dedicated() executor_mode = os.getenv("EXECUTOR_MODE", "False").lower() == "true" dlc_images = os.getenv("DLC_IMAGE") if executor_mode else get_dlc_images() + + # Enable IPv6 testing from environment variable + if not executor_mode: + ipv6_enabled = os.getenv("ENABLE_IPV6_TESTING", "false").lower() == "true" + os.environ["ENABLE_IPV6_TESTING"] = "true" if ipv6_enabled else "false" + # Executing locally ona can provide commit_id or may ommit it. Assigning default value for local executions: commit_id = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION", default="unrecognised_commit_id") LOGGER.info(f"Images tested: {dlc_images}")