diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml
index 8efdc90d2983..f641233551b9 100644
--- a/dlc_developer_config.toml
+++ b/dlc_developer_config.toml
@@ -37,7 +37,7 @@ deep_canary_mode = false
 
 [build]
 # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
 # available frameworks - ["base", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
-build_frameworks = []
+build_frameworks = ["tensorflow"]
 
 # By default we build both training and inference containers. Set true/false values to determine which to build.
@@ -71,13 +71,16 @@ ec2_benchmark_tests = false
 ### default. If false, these types of tests will be skipped while other tests will run as usual.
 ### These tests are run in EC2 test jobs, so ec2_tests must be true if ec2_tests_on_heavy_instances is true.
 ### Off by default (set to false)
-ec2_tests_on_heavy_instances = false
+ec2_tests_on_heavy_instances = true
+### Set enable_ipv6 = true to run tests with IPv6-enabled resources
+### Off by default (set to false)
+enable_ipv6 = true
 
 ### SM specific tests
 ### On by default
 sagemaker_local_tests = true
 # run standard sagemaker remote tests from test/sagemaker_tests
-sagemaker_remote_tests = true
+sagemaker_remote_tests = false
 # run efa sagemaker tests
 sagemaker_efa_tests = false
 # run release_candidate_integration tests
@@ -105,7 +108,7 @@ use_scheduler = false
 
 # Standard Framework Training
 dlc-pr-pytorch-training = ""
-dlc-pr-tensorflow-2-training = ""
+dlc-pr-tensorflow-2-training = "tensorflow/training/buildspec-2-18-sm.yml"
 dlc-pr-autogluon-training = ""
 
 # ARM64 Training
@@ -135,7 +138,7 @@ dlc-pr-tensorflow-2-habana-training = ""
 
 # Standard Framework Inference
 dlc-pr-pytorch-inference = ""
-dlc-pr-tensorflow-2-inference = ""
+dlc-pr-tensorflow-2-inference = "tensorflow/inference/buildspec-2-18-sm.yml"
 dlc-pr-autogluon-inference = ""
 
 # Graviton Inference
diff --git a/pytorch/inference/buildspec-2-4-ec2.yml b/pytorch/inference/buildspec-2-4-ec2.yml
index 1118c3110690..d2bad7c45ca5 100644
--- a/pytorch/inference/buildspec-2-4-ec2.yml
+++ b/pytorch/inference/buildspec-2-4-ec2.yml
@@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch
 version: &VERSION 2.4.0
 short_version: &SHORT_VERSION "2.4"
 arch_type: x86
-autopatch_build: "True"
+# autopatch_build: "True"
 
 repository_info:
   inference_repository: &INFERENCE_REPOSITORY
@@ -44,6 +44,7 @@ images:
     tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
     latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
     # build_tag_override: "beta:2.4.0-cpu-py311-ubuntu22.04-ec2"
+    build_tag_override: "True"
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
     target: ec2
     context:
@@ -61,6 +62,7 @@ images:
    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
    # build_tag_override: "beta:2.4.0-gpu-py311-cu124-ubuntu22.04-ec2"
+    build_tag_override: "True"
    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
    target: ec2
    context:
diff --git a/pytorch/inference/buildspec-2-4-sm.yml b/pytorch/inference/buildspec-2-4-sm.yml
index 8fb0b8260a42..710b0b122d97 100644
--- a/pytorch/inference/buildspec-2-4-sm.yml
+++ b/pytorch/inference/buildspec-2-4-sm.yml
@@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch
 version: &VERSION 2.4.0
 short_version: &SHORT_VERSION "2.4"
 arch_type: x86
-autopatch_build: "True"
+# autopatch_build: "True"
 
 repository_info:
   inference_repository: &INFERENCE_REPOSITORY
@@ -45,6 +45,7 @@ images:
     tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
     latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
     # build_tag_override: "beta:2.4.0-cpu-py311-ubuntu22.04-sagemaker"
+    build_tag_override: "True"
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
     target: sagemaker
     context:
@@ -63,6 +64,7 @@ images:
     tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
     latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
     # build_tag_override: "beta:2.4.0-gpu-py311-cu124-ubuntu22.04-sagemaker"
+    build_tag_override: "True"
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
     target: sagemaker
diff --git a/pytorch/inference/buildspec-2-5-ec2.yml b/pytorch/inference/buildspec-2-5-ec2.yml
index a34fb55735f8..8cd21091ec07 100644
--- a/pytorch/inference/buildspec-2-5-ec2.yml
+++ b/pytorch/inference/buildspec-2-5-ec2.yml
@@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch
 version: &VERSION 2.5.1
 short_version: &SHORT_VERSION "2.5"
 arch_type: x86
-autopatch_build: "True"
+# autopatch_build: "True"
 
 repository_info:
   inference_repository: &INFERENCE_REPOSITORY
@@ -44,6 +44,7 @@ images:
     tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
     latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
     # build_tag_override: "beta:2.5.0-cpu-py311-ubuntu22.04-ec2"
+    build_tag_override: "True"
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
     target: ec2
     context:
@@ -61,6 +62,7 @@ images:
     tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
     latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
     # build_tag_override: "beta:2.5.0-gpu-py311-cu124-ubuntu22.04-ec2"
+    build_tag_override: "True"
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
     target: ec2
     context:
diff --git a/pytorch/inference/buildspec-2-5-sm.yml b/pytorch/inference/buildspec-2-5-sm.yml
index 0bba3d6138f4..cdf12138a82e 100644
--- a/pytorch/inference/buildspec-2-5-sm.yml
+++ b/pytorch/inference/buildspec-2-5-sm.yml
@@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch
 version: &VERSION 2.5.1
 short_version: &SHORT_VERSION "2.5"
 arch_type: x86
-autopatch_build: "True"
+# autopatch_build: "True"
 
 repository_info:
   inference_repository: &INFERENCE_REPOSITORY
@@ -45,6 +45,7 @@ images:
     tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
     latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
     # build_tag_override: "beta:2.5.0-cpu-py311-ubuntu22.04-sagemaker"
+    build_tag_override: "True"
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
     target: sagemaker
     context:
@@ -63,6 +64,7 @@ images:
     tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
     latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
     # build_tag_override: "beta:2.5.0-gpu-py311-cu124-ubuntu22.04-sagemaker"
+    build_tag_override: "True"
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
     target: sagemaker
diff --git a/pytorch/inference/buildspec-2-6-ec2.yml b/pytorch/inference/buildspec-2-6-ec2.yml
index dc825393378d..07e180e2e678 100644
--- a/pytorch/inference/buildspec-2-6-ec2.yml
+++ b/pytorch/inference/buildspec-2-6-ec2.yml
@@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch
 version: &VERSION 2.6.0
 short_version: &SHORT_VERSION "2.6"
 arch_type: x86
-autopatch_build: "True"
+# autopatch_build: "True"
 
 repository_info:
   inference_repository: &INFERENCE_REPOSITORY
@@ -44,6 +44,7 @@ images:
     tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
     latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
     # build_tag_override: "False"
+    build_tag_override: "True"
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
     target: ec2
     context:
@@ -61,6 +62,7 @@ images:
     tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
     latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
     # build_tag_override: "False"
+    build_tag_override: "True"
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
     target: ec2
     context:
diff --git a/pytorch/inference/buildspec-2-6-sm.yml b/pytorch/inference/buildspec-2-6-sm.yml
index 3b1135a26856..1ba01ab613db 100644
--- a/pytorch/inference/buildspec-2-6-sm.yml
+++ b/pytorch/inference/buildspec-2-6-sm.yml
@@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch
 version: &VERSION 2.6.0
 short_version: &SHORT_VERSION "2.6"
 arch_type: x86
-autopatch_build: "True"
+# autopatch_build: "True"
 
 repository_info:
   inference_repository: &INFERENCE_REPOSITORY
@@ -48,6 +48,7 @@ images:
     tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
     latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
     # build_tag_override: "beta:2.5.0-cpu-py311-ubuntu22.04-sagemaker"
+    build_tag_override: "True"
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
     target: sagemaker
     context:
@@ -66,6 +67,7 @@ images:
     tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
     latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
     # build_tag_override: "beta:2.5.0-gpu-py311-cu124-ubuntu22.04-sagemaker"
+    build_tag_override: "True"
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
     target: sagemaker
diff --git a/pytorch/training/buildspec-2-4-ec2.yml b/pytorch/training/buildspec-2-4-ec2.yml
index 88bbcb846f8f..9200fdb56c94 100644
--- a/pytorch/training/buildspec-2-4-ec2.yml
+++ b/pytorch/training/buildspec-2-4-ec2.yml
@@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch
 version: &VERSION 2.4.0
 short_version: &SHORT_VERSION "2.4"
 arch_type: x86
-autopatch_build: "True"
+# autopatch_build: "True"
 
 repository_info:
   training_repository: &TRAINING_REPOSITORY
@@ -42,7 +42,7 @@ images:
     os_version: &OS_VERSION ubuntu22.04
     tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
     latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
-    # build_tag_override: "True"
+    build_tag_override: "True"
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
     target: ec2
     context:
@@ -58,7 +58,7 @@ images:
     os_version: &OS_VERSION ubuntu22.04
     tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
     latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
-    # build_tag_override: "True"
+    build_tag_override: "True"
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
     target: ec2
diff --git a/pytorch/training/buildspec-2-4-sm.yml b/pytorch/training/buildspec-2-4-sm.yml
index 994115f4029c..e5c9826b7385 100644
--- a/pytorch/training/buildspec-2-4-sm.yml
+++ b/pytorch/training/buildspec-2-4-sm.yml
@@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch
 version: &VERSION 2.4.0
 short_version: &SHORT_VERSION "2.4"
 arch_type: x86
-autopatch_build: "True"
+# autopatch_build: "True"
 
 repository_info:
   training_repository: &TRAINING_REPOSITORY
@@ -43,6 +43,7 @@ images:
     tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
     latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
     # build_tag_override: "beta:2.4.0-cpu-py311-ubuntu22.04-sagemaker"
+    build_tag_override: "True"
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
     target: sagemaker
     context:
@@ -59,6 +60,7 @@ images:
     tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
     latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
     # build_tag_override: "beta:2.4.0-gpu-py311-cu124-ubuntu22.04-sagemaker"
+    build_tag_override: "True"
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
     target: sagemaker
diff --git a/pytorch/training/buildspec-2-5-ec2.yml b/pytorch/training/buildspec-2-5-ec2.yml
index b2d6d8a7edeb..59b5d2d22694 100644
--- a/pytorch/training/buildspec-2-5-ec2.yml
+++ b/pytorch/training/buildspec-2-5-ec2.yml
@@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch
 version: &VERSION 2.5.1
 short_version: &SHORT_VERSION "2.5"
 arch_type: x86
-autopatch_build: "True"
+# autopatch_build: "True"
 
 repository_info:
   training_repository: &TRAINING_REPOSITORY
@@ -43,6 +43,7 @@ images:
     tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
     latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
     # build_tag_override: "beta:2.5.1-cpu-py311-ubuntu22.04-ec2"
+    build_tag_override: "True"
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
     target: ec2
     context:
@@ -59,6 +60,7 @@ images:
     tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
     latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
     # build_tag_override: "beta:2.5.1-gpu-py311-cu121-ubuntu22.04-ec2"
+    build_tag_override: "True"
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
     target: ec2
diff --git a/pytorch/training/buildspec-2-5-sm.yml b/pytorch/training/buildspec-2-5-sm.yml
index e3b305db5738..5f32b6835ba9 100644
--- a/pytorch/training/buildspec-2-5-sm.yml
+++ b/pytorch/training/buildspec-2-5-sm.yml
@@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch
 version: &VERSION 2.5.1
 short_version: &SHORT_VERSION "2.5"
 arch_type: x86
-autopatch_build: "True"
+# autopatch_build: "True"
 
 repository_info:
   training_repository: &TRAINING_REPOSITORY
@@ -49,6 +49,7 @@ images:
     tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
     latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
     # build_tag_override: "beta:2.5.1-cpu-py311-ubuntu22.04-sagemaker"
+    build_tag_override: "True"
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
     target: sagemaker
     context:
@@ -65,6 +66,7 @@ images:
     tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
     latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
     # build_tag_override: "beta:2.5.1-gpu-py311-cu124-ubuntu22.04-sagemaker"
+    build_tag_override: "True"
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
     target: sagemaker
diff --git a/pytorch/training/buildspec-2-6-ec2.yml b/pytorch/training/buildspec-2-6-ec2.yml
index 6e55dba3a167..402c1879ca0c 100644
--- a/pytorch/training/buildspec-2-6-ec2.yml
+++ b/pytorch/training/buildspec-2-6-ec2.yml
@@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch
 version: &VERSION 2.6.0
 short_version: &SHORT_VERSION "2.6"
 arch_type: x86
-autopatch_build: "True"
+# autopatch_build: "True"
 
 repository_info:
   training_repository: &TRAINING_REPOSITORY
@@ -48,7 +48,7 @@ images:
     os_version: &OS_VERSION ubuntu22.04
     tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
     latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
-    # build_tag_override: "True"
+    build_tag_override: "True"
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
     target: ec2
     context:
@@ -64,7 +64,7 @@ images:
     os_version: &OS_VERSION ubuntu22.04
     tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
     latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
-    # build_tag_override: "True"
+    build_tag_override: "True"
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
     target: ec2
diff --git a/pytorch/training/buildspec-2-6-sm.yml b/pytorch/training/buildspec-2-6-sm.yml
index 2e2199ec48be..cc7477d83157 100644
--- a/pytorch/training/buildspec-2-6-sm.yml
+++ b/pytorch/training/buildspec-2-6-sm.yml
@@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch
 version: &VERSION 2.6.0
 short_version: &SHORT_VERSION "2.6"
 arch_type: x86
-autopatch_build: "True"
+# autopatch_build: "True"
 
 repository_info:
   training_repository: &TRAINING_REPOSITORY
@@ -48,7 +48,7 @@ images:
     os_version: &OS_VERSION ubuntu22.04
     tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
     latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
-    # build_tag_override: "beta:2.6.0-cpu-py311-ubuntu22.04-sagemaker"
+    build_tag_override: "True"
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
     target: sagemaker
     context:
@@ -64,7 +64,7 @@ images:
     os_version: &OS_VERSION ubuntu22.04
     tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
     latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
-    # build_tag_override: "beta:2.6.0-gpu-py311-cu124-ubuntu22.04-sagemaker"
+    build_tag_override: "True"
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
     target: sagemaker
diff --git a/src/config.py b/src/config.py
index a485b402f14b..b44253450010 100644
--- a/src/config.py
+++ b/src/config.py
@@ -75,6 +75,10 @@ def are_heavy_instance_ec2_tests_enabled():
     return parse_dlc_developer_configs("test", "ec2_tests_on_heavy_instances")
 
 
+def is_ipv6_test_enabled():
+    return parse_dlc_developer_configs("test", "enable_ipv6")
+
+
 def is_ecs_test_enabled():
     return parse_dlc_developer_configs("test", "ecs_tests")
diff --git a/src/start_testbuilds.py b/src/start_testbuilds.py
index c1540d8e8541..dad1606b7e1a 100644
--- a/src/start_testbuilds.py
+++ b/src/start_testbuilds.py
@@ -51,6 +51,9 @@ def run_test_job(commit, codebuild_project, images_str=""):
         config.are_heavy_instance_ec2_tests_enabled() and "ec2" in codebuild_project
     )
 
+    # For EC2 tests, enable IPv6 testing when config is enabled
+    is_ipv6_test_enabled = config.is_ipv6_test_enabled() and "ec2" in codebuild_project
+
     if config.is_deep_canary_mode_enabled():
         env_overrides.append({"name": "DEEP_CANARY_MODE", "value": "true", "type": "PLAINTEXT"})
 
@@ -90,6 +93,11 @@ def run_test_job(commit, codebuild_project, images_str=""):
             "value": str(are_heavy_instance_ec2_tests_enabled),
             "type": "PLAINTEXT",
         },
+        {
+            "name": "ENABLE_IPV6_TESTING",
+            "value": str(is_ipv6_test_enabled),
+            "type": "PLAINTEXT",
+        },
         {
             "name": "FRAMEWORK_BUILDSPEC_FILE",
             "value": config.get_buildspec_override() or os.getenv("FRAMEWORK_BUILDSPEC_FILE"),
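Note (illustrative, not part of the patch): a minimal sketch of the intended flow for the new enable_ipv6 toggle, read from dlc_developer_config.toml through the config wrapper added above and handed to EC2 test jobs as the ENABLE_IPV6_TESTING environment variable. The CodeBuild project name and the import path are assumptions, not values from this change.

    import config  # assumes src/ is on the path, as when running the build scripts

    codebuild_project = "dlc-pr-tensorflow-2-ec2-tests"  # hypothetical project name

    # Mirrors run_test_job(): IPv6 testing is only switched on for EC2 test projects
    is_ipv6_test_enabled = config.is_ipv6_test_enabled() and "ec2" in codebuild_project

    env_overrides = [
        {
            "name": "ENABLE_IPV6_TESTING",
            "value": str(is_ipv6_test_enabled),
            "type": "PLAINTEXT",
        },
    ]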
build_tag_override: "beta:2.16.1-cpu-py310-ubuntu20.04-ec2" + build_tag_override: "True" context: <<: *INFERENCE_CONTEXT BuildEC2TensorflowGPUInferencePy3DockerImage: @@ -68,5 +69,6 @@ images: *DEVICE_TYPE ] target: ec2 # build_tag_override: "beta:2.16.1-gpu-py310-cu122-ubuntu20.04-ec2" + build_tag_override: "True" context: <<: *INFERENCE_CONTEXT diff --git a/tensorflow/training/buildspec-2-18-ec2.yml b/tensorflow/training/buildspec-2-18-ec2.yml index 05271d91adc7..5f368f20d701 100644 --- a/tensorflow/training/buildspec-2-18-ec2.yml +++ b/tensorflow/training/buildspec-2-18-ec2.yml @@ -5,7 +5,7 @@ framework: &FRAMEWORK tensorflow version: &VERSION 2.18.0 short_version: &SHORT_VERSION "2.18" arch_type: x86 -autopatch_build: "True" +# autopatch_build: "True" repository_info: training_repository: &TRAINING_REPOSITORY @@ -38,7 +38,7 @@ images: tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] - # build_tag_override: "True" + build_tag_override: "True" docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] target: ec2 enable_test_promotion: true @@ -56,7 +56,7 @@ images: tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] - # build_tag_override: "True" + build_tag_override: "True" docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] target: ec2 diff --git a/test/dlc_tests/sanity/quick_checks/test_dlc_developer_config.py b/test/dlc_tests/sanity/quick_checks/test_dlc_developer_config.py index a0e374e6739b..cf2ff7d032d6 100644 --- a/test/dlc_tests/sanity/quick_checks/test_dlc_developer_config.py +++ b/test/dlc_tests/sanity/quick_checks/test_dlc_developer_config.py @@ -44,6 +44,7 @@ def test_developer_configuration(): assert config.parse_dlc_developer_configs("test", "ec2_benchmark_tests") is False assert config.parse_dlc_developer_configs("test", "sagemaker_benchmark_tests") is False assert config.parse_dlc_developer_configs("test", "ec2_tests_on_heavy_instances") is False + assert config.parse_dlc_developer_configs("test", "enable_ipv6") is False assert config.parse_dlc_developer_configs("test", "nightly_pr_test_mode") is False assert config.parse_dlc_developer_configs("test", "use_scheduler") is False assert config.parse_dlc_developer_configs("test", "safety_check_test") is False @@ -75,6 +76,7 @@ def test_developer_config_wrappers_defaults(): assert config.is_ec2_benchmark_test_enabled() is False assert config.is_sm_benchmark_test_enabled() is False assert config.are_heavy_instance_ec2_tests_enabled() is False + assert config.is_ipv6_test_enabled() is False assert config.is_nightly_pr_test_mode_enabled() is False assert config.is_scheduler_enabled() is False assert config.is_safety_check_test_enabled() is False diff --git a/test/test_utils/ec2.py b/test/test_utils/ec2.py index 2968b3204d76..ef2302fb5d27 100644 --- a/test/test_utils/ec2.py +++ b/test/test_utils/ec2.py @@ -48,6 +48,9 @@ # List of instance types which are too powerful for minor tests HEAVY_INSTANCE_LIST = ["p4d.24xlarge", "p4de.24xlarge", "p5.48xlarge"] +# Flag to enable IPv6 testing +ENABLE_IPV6_TESTING = os.getenv("ENABLE_IPV6_TESTING", "false").lower() == "true" + LOGGER = 
logging.getLogger(__name__) LOGGER.addHandler(logging.StreamHandler(sys.stdout)) @@ -1715,6 +1718,108 @@ def get_default_subnet_for_az(ec2_client, availability_zone): return az_subnet_id +def get_vpc_id_by_name(ec2_client, vpc_name): + """ + Get VPC ID by VPC name tag + :param ec2_client: boto3 EC2 Client object + :param vpc_name: Name tag value of the VPC + :return: str VPC ID of the VPC name + """ + response = ec2_client.describe_vpcs(Filters=[{"Name": "tag:Name", "Values": [vpc_name]}]).get( + "Vpcs", [] + ) + + if not response: + raise Exception(f"No VPC found with Name tag: {vpc_name}") + elif len(response) > 1: + raise Exception(f"Multiple VPCs found with Name tag: {vpc_name}") + + vpc_id = response[0]["VpcId"] + return vpc_id + + +def get_default_security_group_id_by_vpc_id(ec2_client, vpc_name): + """ + Get default SG ID for a non-default VPC + :param ec2_client: boto3 EC2 Client object + :param vpc_name: Name tag value of the VPC + :return: str SG ID of the default SG + """ + try: + vpc_id = get_vpc_id_by_name(ec2_client, vpc_name) + + response = ec2_client.describe_security_groups( + Filters=[ + {"Name": "vpc-id", "Values": [vpc_id]}, + {"Name": "group-name", "Values": ["default"]}, + ], + ) + + security_group_id = response["SecurityGroups"][0]["GroupId"] + return security_group_id + except Exception as e: + LOGGER.error(f"Error in get_default_security_group_id_by_vpc_id: {str(e)}") + raise + + +def get_ipv6_efa_enabled_security_group_id(ec2_client, vpc_name): + """ + Get EFA-enabled SG ID for IPv6 VPC + :param ec2_client: boto3 EC2 Client object + :param vpc_name: Name tag value of the VPC + :return: str SG ID of the EFA-enabled SG + """ + try: + vpc_id = get_vpc_id_by_name(ec2_client, vpc_name) + + # get the EFA-enabled SG + response = ec2_client.describe_security_groups( + Filters=[ + {"Name": "vpc-id", "Values": [vpc_id]}, + {"Name": "group-name", "Values": ["EFA-enabled-ipv6"]}, + ], + ) + + efa_security_group_id = response["SecurityGroups"][0]["GroupId"] + return efa_security_group_id + except Exception as e: + LOGGER.error(f"Error in get_ipv6_efa_enabled_security_group_id: {str(e)}") + raise + + +def get_ipv6_enabled_subnet_for_az(ec2_client, vpc_name, availability_zone): + """ + Get IPv6-enabled subnet ID in the a particular availability zone + :param ec2_client: boto3 EC2 Client object + :param vpc_name: Name tag value of the VPC + :param availability_zone: str AZ name + :return: str Subnet ID of an IPv6-enabled subnet + """ + try: + vpc_id = get_vpc_id_by_name(ec2_client, vpc_name) + + response = ec2_client.describe_subnets( + Filters=[ + {"Name": "vpc-id", "Values": [vpc_id]}, + {"Name": "availability-zone", "Values": [availability_zone]}, + ] + ) + + ipv6_subnets = [ + subnet for subnet in response["Subnets"] if subnet.get("Ipv6CidrBlockAssociationSet") + ] + + if not ipv6_subnets: + raise Exception( + f"No IPv6-enabled subnet found in AZ {availability_zone} for VPC {vpc_id}" + ) + + return ipv6_subnets[0]["SubnetId"] + except Exception as e: + LOGGER.error(f"Error in get_ipv6_enabled_subnet_for_az: {str(e)}") + raise + + def generate_network_interfaces(ec2_client, ec2_instance_type, availability_zone): """ Generate list of EFA-network-interfaces based on the number of network-interfaces available @@ -1728,6 +1833,31 @@ def generate_network_interfaces(ec2_client, ec2_instance_type, availability_zone if not num_efa_interfaces: raise AttributeError(f"Unable to get number of EFA Interfaces for {ec2_instance_type}") + enable_ipv6 = os.environ.get("ENABLE_IPV6_TESTING", 
"false").lower() == "true" + + # TODO: remove hardcoded vpc name for testing + ipv6_vpc_name = "dlc-ipv6-test-vpc" + + if enable_ipv6: + ipv6_default_sg = get_default_security_group_id_by_vpc_id(ec2_client, ipv6_vpc_name) + ipv6_efa_sg = get_ipv6_efa_enabled_security_group_id(ec2_client, ipv6_vpc_name) + ipv6_subnet_id = get_ipv6_enabled_subnet_for_az( + ec2_client, ipv6_vpc_name, availability_zone + ) + + network_interfaces = [ + { + "DeviceIndex": 0 if i == 0 else 1, + "NetworkCardIndex": i, + "DeleteOnTermination": True, + "InterfaceType": "efa", + "Groups": [ipv6_default_sg, ipv6_efa_sg], + "SubnetId": ipv6_subnet_id, + } + for i in range(num_efa_interfaces) + ] + return network_interfaces + default_sg = get_default_security_group_id(ec2_client) efa_sg = get_efa_enabled_security_group_id(ec2_client) default_subnet_id = get_default_subnet_for_az(ec2_client, availability_zone) @@ -1786,10 +1916,24 @@ def attach_elastic_ip(network_interface_id, region="us-east-1"): def delete_elastic_ips(elastic_ip_allocation_ids, ec2_client): - """Deletes elastic ips created for efa p4d testing""" + """ + Deletes elastic ips created for efa p4d testing. + For default VPC (IPv4): can release directly + For non-default VPC (IPv6): need to disassociate before release + """ for allocation_id in elastic_ip_allocation_ids: - LOGGER.info(f"Deleting elastic ip {allocation_id}") - ec2_client.release_address(AllocationId=allocation_id) + try: + enable_ipv6 = os.environ.get("ENABLE_IPV6_TESTING", "false").lower() == "true" + if enable_ipv6: + address = ec2_client.describe_addresses(AllocationIds=[allocation_id])["Addresses"][ + 0 + ] + if "AssociationId" in address: + ec2_client.disassociate_address(AssociationId=address["AssociationId"]) + time.sleep(10) + ec2_client.release_address(AllocationId=allocation_id) + except Exception as e: + LOGGER.error(f"Failed to delete elastic ip {allocation_id}: {str(e)}") def create_name_tags_for_instance(instance_id, name_tag, region): diff --git a/test/testrunner.py b/test/testrunner.py index 235c5538af17..95ec4225ae82 100644 --- a/test/testrunner.py +++ b/test/testrunner.py @@ -290,6 +290,13 @@ def main(): efa_dedicated = is_efa_dedicated() executor_mode = os.getenv("EXECUTOR_MODE", "False").lower() == "true" dlc_images = os.getenv("DLC_IMAGE") if executor_mode else get_dlc_images() + + # Enable IPv6 testing from environment variable + if not executor_mode: + ipv6_enabled = os.getenv("ENABLE_IPV6_TESTING", "false").lower() == "true" + LOGGER.info(f"IPv6 Testing enabled: {ipv6_enabled}") + os.environ["ENABLE_IPV6_TESTING"] = "true" if ipv6_enabled else "false" + # Executing locally ona can provide commit_id or may ommit it. Assigning default value for local executions: commit_id = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION", default="unrecognised_commit_id") LOGGER.info(f"Images tested: {dlc_images}")