diff --git a/miscellaneous_scripts/bash_telemetry.sh b/miscellaneous_scripts/bash_telemetry.sh new file mode 100644 index 000000000000..c0f1d7c14275 --- /dev/null +++ b/miscellaneous_scripts/bash_telemetry.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# telemetry.sh +if [ -f /usr/local/bin/deep_learning_container.py ] && [[ -z "${OPT_OUT_TRACKING}" || "${OPT_OUT_TRACKING,,}" != "true" ]]; then + ( + python /usr/local/bin/deep_learning_container.py \ + --framework "${FRAMEWORK}" \ + --framework-version "${FRAMEWORK_VERSION}" \ + --container-type "${CONTAINER_TYPE}" \ + &>/dev/null & + ) +fi + diff --git a/pytorch/inference/docker/build_artifacts/torchserve-ec2-entrypoint.py b/pytorch/inference/docker/build_artifacts/torchserve-ec2-entrypoint.py index ba8efcade9b7..98614996dc19 --- a/pytorch/inference/docker/build_artifacts/torchserve-ec2-entrypoint.py +++ b/pytorch/inference/docker/build_artifacts/torchserve-ec2-entrypoint.py @@ -17,6 +17,16 @@ import sys +try: + subprocess.call( + ["/bin/bash", "/usr/local/bin/bash_telemetry.sh"], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) +except: + pass + + subprocess.check_call(shlex.split(" ".join(sys.argv[1:]))) # prevent docker exit diff --git a/pytorch/inference/docker/build_artifacts/torchserve-entrypoint.py b/pytorch/inference/docker/build_artifacts/torchserve-entrypoint.py index 134cd56f8fe8..178829f1f76a --- a/pytorch/inference/docker/build_artifacts/torchserve-entrypoint.py +++ b/pytorch/inference/docker/build_artifacts/torchserve-entrypoint.py @@ -19,6 +19,15 @@ import signal import torch +try: + subprocess.call( + ["/bin/bash", "/usr/local/bin/bash_telemetry.sh"], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) +except: + pass + if torch.cuda.is_available(): # run compat mounting by default try: diff --git a/pytorch/training/buildspec-2-5-ec2.yml b/pytorch/training/buildspec-2-5-ec2.yml index b2d6d8a7edeb..39438d1ed034 --- a/pytorch/training/buildspec-2-5-ec2.yml +++ b/pytorch/training/buildspec-2-5-ec2.yml @@ -42,7 +42,7 @@ images: os_version: &OS_VERSION ubuntu22.04 tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] - # build_tag_override: "beta:2.5.1-cpu-py311-ubuntu22.04-ec2" + # build_tag_override: "true" docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] target: ec2 context: @@ -58,7 +58,7 @@ images: os_version: &OS_VERSION ubuntu22.04 tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] - # build_tag_override: "beta:2.5.1-gpu-py311-cu121-ubuntu22.04-ec2" + # build_tag_override: "true" docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] target: ec2 diff --git a/pytorch/training/docker/build_artifacts/dockerd_entrypoint.sh b/pytorch/training/docker/build_artifacts/dockerd_entrypoint.sh index e89b3bba31a7..beb048a386bf --- a/pytorch/training/docker/build_artifacts/dockerd_entrypoint.sh +++ b/pytorch/training/docker/build_artifacts/dockerd_entrypoint.sh @@ -1,4 +1,7 @@ #!/usr/bin/env bash +# Check if telemetry file exists before executing +# Execute telemetry script if it exists, suppress errors +bash /usr/local/bin/bash_telemetry.sh 
>/dev/null 2>&1 || true CUDA_AVAILABLE=$(python -c "import torch; print(torch.cuda.is_available())") if [ "$CUDA_AVAILABLE" = "True" ]; then diff --git a/pytorch/training/docker/build_artifacts/start_with_right_hostname.sh b/pytorch/training/docker/build_artifacts/start_with_right_hostname.sh index 1f81528f87b4..fedaf5ec8845 100644 --- a/pytorch/training/docker/build_artifacts/start_with_right_hostname.sh +++ b/pytorch/training/docker/build_artifacts/start_with_right_hostname.sh @@ -1,5 +1,10 @@ #!/usr/bin/env bash +# telemetry.sh +# Execute telemetry script if it exists, suppress errors +bash /usr/local/bin/bash_telemetry.sh >/dev/null 2>&1 || true + + CUDA_AVAILABLE=$(python -c "import torch; print(torch.cuda.is_available())") if [ "$CUDA_AVAILABLE" = "True" ]; then bash /usr/local/bin/start_cuda_compat.sh diff --git a/src/image_builder.py b/src/image_builder.py index 2c6e856d0a00..e1b4d4fe7355 100644 --- a/src/image_builder.py +++ b/src/image_builder.py @@ -255,9 +255,12 @@ def image_builder(buildspec, image_types=[], device_types=[]): f"This is required to set job_type label." ) - template_file = os.path.join( + sitecustomize_template_file = os.path.join( os.sep, get_cloned_folder_path(), "miscellaneous_scripts", "dlc_template.py" ) + bash_template_file = os.path.join( + os.sep, get_cloned_folder_path(), "miscellaneous_scripts", "bash_telemetry.sh" + ) template_fw_version = ( str(image_config["framework_version"]) @@ -265,16 +268,29 @@ def image_builder(buildspec, image_types=[], device_types=[]): else str(BUILDSPEC["version"]) ) template_fw = str(BUILDSPEC["framework"]) - post_template_file = utils.generate_dlc_cmd( - template_path=template_file, + sitecustomize_post_template_file = utils.generate_dlc_cmd( + template_path=sitecustomize_template_file, output_path=os.path.join(image_config["root"], "out.py"), framework=template_fw, framework_version=template_fw_version, container_type=label_job_type, ) + bash_post_template_file = utils.generate_dlc_cmd( + template_path=bash_template_file, + output_path=os.path.join(image_config["root"], "telemetry.sh"), + framework=template_fw, + framework_version=template_fw_version, + container_type=label_job_type, + ) ARTIFACTS.update( - {"customize": {"source": post_template_file, "target": "sitecustomize.py"}} + { + "customize": { + "source": sitecustomize_post_template_file, + "target": "sitecustomize.py", + }, + "bash": {"source": bash_post_template_file, "target": "bash_telemetry.sh"}, + } ) context = Context(ARTIFACTS, f"build/{image_name}.tar.gz", image_config["root"]) diff --git a/src/utils.py b/src/utils.py index f2a0d73a5cc8..9d8d2b6ec079 100644 --- a/src/utils.py +++ b/src/utils.py @@ -688,7 +688,8 @@ def generate_dlc_cmd(template_path, output_path, framework, framework_version, c } for anchor, value in replacements.items(): - content = content.replace(f"{{{anchor}}}", value) + content = content.replace(f"${{{anchor}}}", value) # replace ${VARIABLE} with value + content = content.replace(f"{{{anchor}}}", value) # replace {VARIABLE} with value with open(output_path, "w") as out_f: out_f.write(content) diff --git a/tensorflow/inference/buildspec-2-18-ec2.yml b/tensorflow/inference/buildspec-2-18-ec2.yml index 983fdc0b710a..6607c4c3b650 100644 --- a/tensorflow/inference/buildspec-2-18-ec2.yml +++ b/tensorflow/inference/buildspec-2-18-ec2.yml @@ -47,7 +47,7 @@ images: "-ec2" ] docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] target: ec2 - # build_tag_override: 
"beta:2.16.1-cpu-py310-ubuntu20.04-ec2" + # build_tag_override: "true" context: <<: *INFERENCE_CONTEXT BuildEC2TensorflowGPUInferencePy3DockerImage: @@ -67,6 +67,6 @@ images: docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] target: ec2 - # build_tag_override: "beta:2.16.1-gpu-py310-cu122-ubuntu20.04-ec2" + # build_tag_override: "true" context: <<: *INFERENCE_CONTEXT diff --git a/tensorflow/inference/buildspec-arm64-2-18-ec2.yml b/tensorflow/inference/buildspec-arm64-2-18-ec2.yml index b25a8229e84f..3393a8deca97 100644 --- a/tensorflow/inference/buildspec-arm64-2-18-ec2.yml +++ b/tensorflow/inference/buildspec-arm64-2-18-ec2.yml @@ -45,6 +45,6 @@ images: latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile.arm64., *DEVICE_TYPE ] target: ec2 - # build_tag_override: "beta:2.16.1-cpu-py310-ubuntu20.04-ec2" + # build_tag_override: "true" context: <<: *INFERENCE_CONTEXT diff --git a/tensorflow/inference/docker/build_artifacts/dockerd_entrypoint.sh b/tensorflow/inference/docker/build_artifacts/dockerd_entrypoint.sh index d5564db3a423..341ba60575a3 100644 --- a/tensorflow/inference/docker/build_artifacts/dockerd_entrypoint.sh +++ b/tensorflow/inference/docker/build_artifacts/dockerd_entrypoint.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +# Execute telemetry script if it exists, suppress errors +bash /usr/local/bin/bash_telemetry.sh >/dev/null 2>&1 || true TF_SERVING_PACKAGE=$(pip list | grep tensorflow-serving | cut -d ' ' -f 1) diff --git a/tensorflow/training/buildspec-2-18-ec2.yml b/tensorflow/training/buildspec-2-18-ec2.yml index 05271d91adc7..5892c9471fda 100644 --- a/tensorflow/training/buildspec-2-18-ec2.yml +++ b/tensorflow/training/buildspec-2-18-ec2.yml @@ -22,6 +22,9 @@ context: dockerd-entrypoint: source: docker/build_artifacts/dockerd-entrypoint.py target: dockerd-entrypoint.py + dockerd_ec2_entrypoint: + source: docker/build_artifacts/dockerd_ec2_entrypoint.sh + target: dockerd_ec2_entrypoint.sh deep_learning_container: source: ../../src/deep_learning_container.py target: deep_learning_container.py diff --git a/tensorflow/training/buildspec-2-18-sm.yml b/tensorflow/training/buildspec-2-18-sm.yml index 3249b0cf834a..86c681ed4a47 100644 --- a/tensorflow/training/buildspec-2-18-sm.yml +++ b/tensorflow/training/buildspec-2-18-sm.yml @@ -25,6 +25,9 @@ context: dockerd-entrypoint: source: docker/build_artifacts/dockerd-entrypoint.py target: dockerd-entrypoint.py + dockerd_ec2_entrypoint: + source: docker/build_artifacts/dockerd_ec2_entrypoint.sh + target: dockerd_ec2_entrypoint.sh deep_learning_container: source: ../../src/deep_learning_container.py target: deep_learning_container.py diff --git a/tensorflow/training/docker/build_artifacts/dockerd-entrypoint.py b/tensorflow/training/docker/build_artifacts/dockerd-entrypoint.py index b58f1e054b47..73d3b8e7cb52 100644 --- a/tensorflow/training/docker/build_artifacts/dockerd-entrypoint.py +++ b/tensorflow/training/docker/build_artifacts/dockerd-entrypoint.py @@ -18,6 +18,16 @@ import sys import tensorflow as tf +try: + subprocess.call( + ["/bin/bash", "/usr/local/bin/bash_telemetry.sh"], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) +except: + pass + + if tf.test.is_built_with_cuda(): # run compat mounting by default try: @@ -25,7 +35,4 @@ except Exception as e: print(f"Error running script: {e}") -if not 
os.path.exists("/opt/ml/input/config"): - subprocess.call(["python", "/usr/local/bin/deep_learning_container.py", "&>/dev/null", "&"]) - subprocess.check_call(shlex.split(" ".join(sys.argv[1:]))) diff --git a/tensorflow/training/docker/build_artifacts/dockerd_ec2_entrypoint.sh b/tensorflow/training/docker/build_artifacts/dockerd_ec2_entrypoint.sh new file mode 100644 index 000000000000..dc792b95fbf7 --- /dev/null +++ b/tensorflow/training/docker/build_artifacts/dockerd_ec2_entrypoint.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +# Execute telemetry script if it exists, suppress errors +bash /usr/local/bin/bash_telemetry.sh >/dev/null 2>&1 || true + +eval "$@" \ No newline at end of file diff --git a/test/dlc_tests/conftest.py b/test/dlc_tests/conftest.py index a58f3b6b0e6e..8df41717ccb0 100644 --- a/test/dlc_tests/conftest.py +++ b/test/dlc_tests/conftest.py @@ -144,6 +144,13 @@ "feature_s3_plugin_present": {NightlyFeatureLabel.AWS_S3_PLUGIN_INSTALLED.value}, } +# Skip telemetry tests for specific versions +TELEMETRY_SKIP_VERSIONS = { + "entrypoint": {"pytorch": ["2.4.0", "2.5.1", "2.6.0"], "tensorflow": ["2.18.0"]}, + "bashrc": {"pytorch": ["2.4.0", "2.5.1", "2.6.0"], "tensorflow": ["2.18.0"]}, + "framework": {"pytorch": [""], "tensorflow": [""]}, +} + # Nightly fixtures @pytest.fixture(scope="session") @@ -1004,6 +1011,56 @@ def skip_serialized_release_pt_test(request): ) +@pytest.fixture(autouse=True) +def skip_telemetry_tests(request): + """Skip specific telemetry tests based on test name and image version""" + test_name = request.node.name.lower() + + if "telemetry_entrypoint" in test_name: + _check_telemetry_skip(request, "entrypoint") + elif "telemetry_bashrc" in test_name: + _check_telemetry_skip(request, "bashrc") + elif "telemetry_framework" in test_name: + _check_telemetry_skip(request, "framework") + + +def _get_telemetry_image_info(request): + """Helper function to get image URI and framework info from fixtures.""" + telemetry_framework_fixtures = [ + "pytorch_training", + "tensorflow_training", + "tensorflow_inference", + "pytorch_inference", + "pytorch_inference_arm64", + "pytorch_training_arm64", + "tensorflow_inference_arm64", + ] + + for fixture_name in telemetry_framework_fixtures: + if fixture_name in request.fixturenames: + img_uri = request.getfixturevalue(fixture_name) + image_framework, image_framework_version = get_framework_and_version_from_tag(img_uri) + return image_framework, image_framework_version + return None, None + + +def _check_telemetry_skip(request, test_type): + """Common logic for skipping telemetry tests.""" + if test_type not in TELEMETRY_SKIP_VERSIONS: + return + image_framework, image_framework_version = _get_telemetry_image_info(request) + if not image_framework: + return + if image_framework not in TELEMETRY_SKIP_VERSIONS[test_type]: + return + + if image_framework_version in TELEMETRY_SKIP_VERSIONS[test_type][image_framework]: + pytest.skip( + f"Telemetry {test_type} test is not supported for " + f"{image_framework} version {image_framework_version}" + ) + + def _validate_pytorch_framework_version(request, image_uri, test_name, skip_dict): """ Expected format of skip_dic: diff --git a/test/dlc_tests/container_tests/bin/pytorch_tests/test_pt_dlc_telemetry_test b/test/dlc_tests/container_tests/bin/pytorch_tests/test_pt_dlc_telemetry_test deleted file mode 100644 index f83447be058b..000000000000 --- a/test/dlc_tests/container_tests/bin/pytorch_tests/test_pt_dlc_telemetry_test +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -HOME_DIR=/test 
-BIN_DIR=${HOME_DIR}/bin -LOG_DIR=${HOME_DIR}/logs -TRAINING_LOG=${LOG_DIR}/pytorch_telemetry_test.log -set -e - -echo "Verify pytorch telemetry test. You can follow progress on the log file : $TRAINING_LOG" | tee -a $TRAINING_LOG - -python ${BIN_DIR}/pytorch_tests/test_pt_dlc_telemetry_test.py 2>&1 | tee $TRAINING_LOG - -if grep "DLC Telemetry performance test Passed" $TRAINING_LOG; then - echo "Successfully verified Telemetry performance test." -else - echo "Telemetry performance test failed." - cat ${TRAINING_LOG} - exit 1 -fi - -if grep "Opt-In/Opt-Out Test passed" $TRAINING_LOG; then - echo "Successfully verified Opt-In/Opt-Out Test " -else - echo "Opt-In/Opt-Out Test failed." - cat ${TRAINING_LOG} - exit 1 -fi - -exit 0 diff --git a/test/dlc_tests/container_tests/bin/pytorch_tests/test_pt_dlc_telemetry_test.py b/test/dlc_tests/container_tests/bin/pytorch_tests/test_pt_dlc_telemetry_test.py deleted file mode 100644 index 8dc7d000b27d..000000000000 --- a/test/dlc_tests/container_tests/bin/pytorch_tests/test_pt_dlc_telemetry_test.py +++ /dev/null @@ -1,98 +0,0 @@ -import os -import numpy as np -import time -from packaging.version import Version - - -def _clean_up_reports(): - if os.path.exists("/tmp/test_request.txt"): - os.system("rm /tmp/test_request.txt") - if os.path.exists("/tmp/test_tag_request.txt"): - os.system("rm /tmp/test_tag_request.txt") - - -def opt_in_opt_out_test(exec_cmd): - os.environ["TEST_MODE"] = "1" - - for opt_out_value in ["True", "TRUE", "true"]: - _clean_up_reports() - os.environ["OPT_OUT_TRACKING"] = opt_out_value - cmd = f"python -c '{exec_cmd}'" - os.system(cmd) - time.sleep(5) - assert not os.path.exists( - "/tmp/test_request.txt" - ), f"URL request placed even though OPT_OUT_TRACKING is {opt_out_value}." - assert not os.path.exists( - "/tmp/test_tag_request.txt" - ), f"Tag request placed even though OPT_OUT_TRACKING is {opt_out_value}." - - for opt_out_value in ["False", "XYgg"]: - _clean_up_reports() - os.environ["OPT_OUT_TRACKING"] = opt_out_value - cmd = f"python -c '{exec_cmd}'" - os.system(cmd) - time.sleep(5) - assert os.path.exists( - "/tmp/test_request.txt" - ), f"URL request not placed even though OPT_OUT_TRACKING is {opt_out_value}." - assert os.path.exists( - "/tmp/test_tag_request.txt" - ), f"Tag request not placed even though OPT_OUT_TRACKING is {opt_out_value}." - - os.environ["TEST_MODE"] = "0" - print("Opt-In/Opt-Out Test passed") - - -def perf_test(exec_cmd): - os.environ["TEST_MODE"] = "0" - os.environ["OPT_OUT_TRACKING"] = "False" - NUM_ITERATIONS = 5 - - for itr in range(NUM_ITERATIONS): - total_time_in = 0 - for x in range(NUM_ITERATIONS): - cmd = f"python -c '{exec_cmd}'" - start = time.time() - os.system(cmd) - total_time_in += time.time() - start - print("avg out time: ", total_time_in / NUM_ITERATIONS) - - total_time_out = 0 - for x in range(NUM_ITERATIONS): - cmd = f"export OPT_OUT_TRACKING='true' && python -c '{exec_cmd}'" - start = time.time() - os.system(cmd) - total_time_out += time.time() - start - print("avg out time: ", total_time_out / NUM_ITERATIONS) - - np.testing.assert_allclose( - total_time_in / NUM_ITERATIONS, total_time_out / NUM_ITERATIONS, rtol=0.2, atol=0.5 - ) - - print("DLC Telemetry performance test Passed") - - -perf_test("import torch") -opt_in_opt_out_test("import torch") - -try: - import torch - - torch_version = torch.__version__ -except ImportError: - raise ImportError("PyTorch is not installed or cannot be imported.") - -# TEMP: sitecustomize.py current exists in PyTorch 2.6 DLCs. 
Skip logic should be reverted once sitecustomize.py has been added to all DLCs -if Version(torch_version) >= Version("2.6"): - print("PyTorch version is 2.6 or higher. Running OS tests...") - perf_test("import os") - opt_in_opt_out_test("import os") - print("OS tests completed.") -else: - print( - "TEMP: sitecustomize.py current exists in PyTorch 2.6 DLCs. Skip logic should be reverted once sitecustomize.py has been added to all DLCs" - ) - print("PyTorch version is below 2.6. Skipping OS tests.") - -print("All DLC telemetry test passed") diff --git a/test/dlc_tests/container_tests/bin/testTelemetry b/test/dlc_tests/container_tests/bin/testTelemetry new file mode 100644 index 000000000000..809670c99901 --- /dev/null +++ b/test/dlc_tests/container_tests/bin/testTelemetry @@ -0,0 +1,113 @@ +#!/bin/bash +HOME_DIR=/test +BIN_DIR=${HOME_DIR}/bin +LOG_DIR=${HOME_DIR}/logs +TRAINING_LOG=${LOG_DIR}/telemetry_test.log +set -e + +mkdir -p ${LOG_DIR} + +verify_test_results() { + local log_file=$1 + + if grep "DLC Telemetry performance test Passed" "$log_file"; then + echo "Successfully verified Telemetry performance test." + else + echo "Telemetry performance test failed." + # cat "$log_file" + return 1 + fi + + if grep "Opt-In/Opt-Out Test passed" "$log_file"; then + echo "Successfully verified Opt-In/Opt-Out Test " + else + echo "Opt-In/Opt-Out Test failed." + # cat "$log_file" + return 1 + fi + + return 0 +} + +check_opt_in_results() { + echo "Checking opt-in telemetry..." | tee -a "$TRAINING_LOG" + sleep 30 + if [ -f /tmp/test_request.txt ]; then + echo "Found test_request.txt file - opt-in working correctly" | tee -a "$TRAINING_LOG" + echo "Content of test_request.txt:" | tee -a "$TRAINING_LOG" + cat /tmp/test_request.txt | tee -a "$TRAINING_LOG" + return 0 + else + echo "Error: test_request.txt file not found - opt-in failed" | tee -a "$TRAINING_LOG" + return 1 + fi +} + +check_opt_out_results() { + echo "Checking opt-out telemetry..." | tee -a "$TRAINING_LOG" + sleep 30 + if [ ! -f /tmp/test_request.txt ]; then + echo "No test_request.txt file found - opt-out working correctly" | tee -a "$TRAINING_LOG" + return 0 + else + echo "Error: test_request.txt file found - opt-out failed" | tee -a "$TRAINING_LOG" + return 1 + fi +} + +# Check if parameter is provided +if [ $# -lt 1 ]; then + echo "Error: No parameter provided" + echo "Usage: $0 [opt_in|opt_out]" + echo "Note: bashrc and entrypoint require opt_in or opt_out parameter" + echo "Available call types: bashrc, entrypoint, framework" + echo "Optional test_type: opt_in, opt_out" + exit 1 +fi + +CALL_TYPE=$1 +echo "Call type: $CALL_TYPE" | tee -a $TRAINING_LOG + +echo "Verify telemetry test for $CALL_TYPE. You can follow progress on the log file : $TRAINING_LOG" | tee -a $TRAINING_LOG + +case $CALL_TYPE in + bashrc | entrypoint) + # Check if opt_in or opt_out parameter is provided + if [ $# -lt 2 ]; then + echo "Error: $CALL_TYPE requires opt_in or opt_out parameter" + echo "Usage: $0 $CALL_TYPE " + exit 1 + fi + + TEST_TYPE=$2 + + echo "Test type: $TEST_TYPE" | tee -a $TRAINING_LOG + + case "$TEST_TYPE" in + "opt_in") + echo "Test ${CALL_TYPE} telemetry opt_in" | tee -a $TRAINING_LOG + check_opt_in_results || exit 1 + ;; + "opt_out") + echo "Test ${CALL_TYPE} telemetry opt_out" | tee -a $TRAINING_LOG + check_opt_out_results || exit 1 + ;; + *) + echo "Error: Invalid test type. 
Must be 'opt_in' or 'opt_out'" + exit 1 + ;; + esac + ;; + framework) + python ${BIN_DIR}/testTelemetry.py --test-cmd "import ${FRAMEWORK}" 2>&1 | tee $TRAINING_LOG + verify_test_results "$TRAINING_LOG" || exit 1 + ;; + *) + echo "Error: Invalid parameter provided" + echo "Available call types: bashrc, entrypoint, framework" + exit 1 + ;; +esac + +exit 0 + diff --git a/test/dlc_tests/container_tests/bin/test_tf_dlc_telemetry_test.py b/test/dlc_tests/container_tests/bin/testTelemetry.py similarity index 77% rename from test/dlc_tests/container_tests/bin/test_tf_dlc_telemetry_test.py rename to test/dlc_tests/container_tests/bin/testTelemetry.py index 091e5e722134..773c51dedc1b 100644 --- a/test/dlc_tests/container_tests/bin/test_tf_dlc_telemetry_test.py +++ b/test/dlc_tests/container_tests/bin/testTelemetry.py @@ -1,6 +1,7 @@ import os import numpy as np import time +import argparse def _clean_up_reports(): @@ -18,7 +19,7 @@ def opt_in_opt_out_test(exec_cmd): os.environ["OPT_OUT_TRACKING"] = opt_out_value cmd = f"python -c '{exec_cmd}'" os.system(cmd) - time.sleep(5) + time.sleep(20) assert not os.path.exists( "/tmp/test_request.txt" ), f"URL request placed even though OPT_OUT_TRACKING is {opt_out_value}." @@ -31,7 +32,7 @@ def opt_in_opt_out_test(exec_cmd): os.environ["OPT_OUT_TRACKING"] = opt_out_value cmd = f"python -c '{exec_cmd}'" os.system(cmd) - time.sleep(5) + time.sleep(20) assert os.path.exists( "/tmp/test_request.txt" ), f"URL request not placed even though OPT_OUT_TRACKING is {opt_out_value}." @@ -43,7 +44,7 @@ def opt_in_opt_out_test(exec_cmd): print("Opt-In/Opt-Out Test passed") -def performance_test(exec_cmd): +def perf_test(exec_cmd): os.environ["TEST_MODE"] = "0" os.environ["OPT_OUT_TRACKING"] = "False" NUM_ITERATIONS = 5 @@ -55,7 +56,7 @@ def performance_test(exec_cmd): start = time.time() os.system(cmd) total_time_in += time.time() - start - print("avg out time: ", total_time_in / NUM_ITERATIONS) + print("avg in time: ", total_time_in / NUM_ITERATIONS) total_time_out = 0 for x in range(NUM_ITERATIONS): @@ -72,12 +73,25 @@ def performance_test(exec_cmd): print("DLC Telemetry performance test Passed") -# test framework functionality -performance_test("import tensorflow") -opt_in_opt_out_test("import tensorflow") +def run_tests(test_cmd): + print(f"Running tests with command: {test_cmd}") + perf_test(test_cmd) + opt_in_opt_out_test(test_cmd) + print("All DLC telemetry test passed") -# Disabling os tests until it is added to all new images -# performance_test("import os") -# opt_in_opt_out_test("import os") -print("All DLC telemetry test passed") +def main(): + parser = argparse.ArgumentParser(description="Run DLC telemetry tests") + parser.add_argument( + "--test-cmd", + type=str, + required=True, + help="The Python command to test", + ) + args = parser.parse_args() + + run_tests(args.test_cmd) + + +if __name__ == "__main__": + main() diff --git a/test/dlc_tests/container_tests/bin/test_tf_dlc_telemetry_test b/test/dlc_tests/container_tests/bin/test_tf_dlc_telemetry_test deleted file mode 100644 index 242f4b2c0665..000000000000 --- a/test/dlc_tests/container_tests/bin/test_tf_dlc_telemetry_test +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -HOME_DIR=/test -BIN_DIR=${HOME_DIR}/bin -LOG_DIR=${HOME_DIR}/logs -TRAINING_LOG=${LOG_DIR}/tensorflow_telemetry_test.log -set -e - -echo "Simply verify if Telemetry works well. 
You can follow progress on the log file : $TRAINING_LOG" | tee -a $TRAINING_LOG - -python ${BIN_DIR}/test_tf_dlc_telemetry_test.py 2>&1 | tee $TRAINING_LOG - -if grep "DLC Telemetry performance test Passed" $TRAINING_LOG; then - echo "Successfully verified Telemetry performance test." -else - echo "Telemetry performance test failed." - cat ${TRAINING_LOG} - exit 1 -fi - -if grep "Opt-In/Opt-Out Test passed" $TRAINING_LOG; then - echo "Successfully verified Opt-In/Opt-Out Test " -else - echo "Opt-In/Opt-Out Test failed." - cat ${TRAINING_LOG} - exit 1 -fi - -exit 0 diff --git a/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference.py b/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference.py index b645d1096fe5..0a5f33067af3 100644 --- a/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference.py +++ b/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference.py @@ -19,6 +19,7 @@ get_ec2_instance_type, execute_ec2_inference_test, get_ec2_accelerator_type, + execute_ec2_telemetry_test, ) from test.dlc_tests.conftest import LOGGER @@ -62,9 +63,7 @@ default="inf2.xlarge", processor="neuronx", job_type="inference" ) -PT_TELEMETRY_CMD = os.path.join( - CONTAINER_TESTS_PREFIX, "pytorch_tests", "test_pt_dlc_telemetry_test" -) +PT_TELEMETRY_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testTelemetry") PT_TORCHAUDIO_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests", "testTorchaudio") PT_TORCHDATA_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests", "testTorchdata") PT_TORCHDATA_DEV_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests", "testTorchdataDev") @@ -403,14 +402,106 @@ def ec2_pytorch_inference(image_uri, processor, ec2_connection, region): @pytest.mark.model("N/A") @pytest.mark.parametrize("ec2_instance_type", PT_EC2_SINGLE_GPU_INSTANCE_TYPE, indirect=True) @pytest.mark.team("conda") -def test_pytorch_inference_telemetry_gpu( +def test_pytorch_inference_telemetry_entrypoint_gpu( + pytorch_inference, ec2_connection, gpu_only, ec2_instance_type, pt15_and_above_only +): + if test_utils.is_image_incompatible_with_instance_type(pytorch_inference, ec2_instance_type): + pytest.skip( + f"Image {pytorch_inference} is incompatible with instance type {ec2_instance_type}" + ) + execute_ec2_telemetry_test( + ec2_connection, + pytorch_inference, + "entrypoint", + "pytorch_inf_telemetry", + test_cmd=PT_TELEMETRY_CMD, + opt_in=False, + ) + execute_ec2_telemetry_test( + ec2_connection, + pytorch_inference, + "entrypoint", + "pytorch_inf_telemetry", + test_cmd=PT_TELEMETRY_CMD, + opt_in=True, + ) + + +@pytest.mark.usefixtures("sagemaker", "stabilityai") +@pytest.mark.integration("telemetry") +@pytest.mark.model("N/A") +@pytest.mark.parametrize("ec2_instance_type", PT_EC2_SINGLE_GPU_INSTANCE_TYPE, indirect=True) +@pytest.mark.team("conda") +def test_pytorch_inference_telemetry_bashrc_gpu( + pytorch_inference, ec2_connection, gpu_only, ec2_instance_type, pt15_and_above_only +): + if test_utils.is_image_incompatible_with_instance_type(pytorch_inference, ec2_instance_type): + pytest.skip( + f"Image {pytorch_inference} is incompatible with instance type {ec2_instance_type}" + ) + execute_ec2_telemetry_test( + ec2_connection, + pytorch_inference, + "bashrc", + "pytorch_inf_telemetry", + test_cmd=PT_TELEMETRY_CMD, + opt_in=False, + ) + execute_ec2_telemetry_test( + ec2_connection, + pytorch_inference, + "bashrc", + "pytorch_inf_telemetry", + test_cmd=PT_TELEMETRY_CMD, + opt_in=True, + ) + + +@pytest.mark.usefixtures("sagemaker", "stabilityai") 
+@pytest.mark.integration("telemetry") +@pytest.mark.model("N/A") +@pytest.mark.parametrize("ec2_instance_type", PT_EC2_SINGLE_GPU_INSTANCE_TYPE, indirect=True) +@pytest.mark.team("conda") +def test_pytorch_inference_telemetry_framework_gpu( pytorch_inference, ec2_connection, gpu_only, ec2_instance_type, pt15_and_above_only ): if test_utils.is_image_incompatible_with_instance_type(pytorch_inference, ec2_instance_type): pytest.skip( f"Image {pytorch_inference} is incompatible with instance type {ec2_instance_type}" ) - execute_ec2_inference_test(ec2_connection, pytorch_inference, PT_TELEMETRY_CMD) + execute_ec2_telemetry_test( + ec2_connection, + pytorch_inference, + "framework", + "pytorch_inf_telemetry", + test_cmd=PT_TELEMETRY_CMD, + ) + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.integration("telemetry") +@pytest.mark.model("N/A") +@pytest.mark.parametrize("ec2_instance_type", PT_EC2_CPU_INSTANCE_TYPE, indirect=True) +@pytest.mark.team("conda") +def test_pytorch_inference_telemetry_entrypoint_cpu( + pytorch_inference, ec2_connection, cpu_only, pt15_and_above_only +): + execute_ec2_telemetry_test( + ec2_connection, + pytorch_inference, + "entrypoint", + "pytorch_inf_telemetry", + test_cmd=PT_TELEMETRY_CMD, + opt_in=False, + ) + execute_ec2_telemetry_test( + ec2_connection, + pytorch_inference, + "entrypoint", + "pytorch_inf_telemetry", + test_cmd=PT_TELEMETRY_CMD, + opt_in=True, + ) @pytest.mark.usefixtures("sagemaker") @@ -418,10 +509,42 @@ def test_pytorch_inference_telemetry_gpu( @pytest.mark.model("N/A") @pytest.mark.parametrize("ec2_instance_type", PT_EC2_CPU_INSTANCE_TYPE, indirect=True) @pytest.mark.team("conda") -def test_pytorch_inference_telemetry_cpu( +def test_pytorch_inference_telemetry_bashrc_cpu( pytorch_inference, ec2_connection, cpu_only, pt15_and_above_only ): - execute_ec2_inference_test(ec2_connection, pytorch_inference, PT_TELEMETRY_CMD) + execute_ec2_telemetry_test( + ec2_connection, + pytorch_inference, + "bashrc", + "pytorch_inf_telemetry", + test_cmd=PT_TELEMETRY_CMD, + opt_in=False, + ) + execute_ec2_telemetry_test( + ec2_connection, + pytorch_inference, + "bashrc", + "pytorch_inf_telemetry", + test_cmd=PT_TELEMETRY_CMD, + opt_in=True, + ) + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.integration("telemetry") +@pytest.mark.model("N/A") +@pytest.mark.parametrize("ec2_instance_type", PT_EC2_CPU_INSTANCE_TYPE, indirect=True) +@pytest.mark.team("conda") +def test_pytorch_inference_telemetry_framework_cpu( + pytorch_inference, ec2_connection, cpu_only, pt15_and_above_only +): + execute_ec2_telemetry_test( + ec2_connection, + pytorch_inference, + "framework", + "pytorch_inf_telemetry", + test_cmd=PT_TELEMETRY_CMD, + ) @pytest.mark.usefixtures("sagemaker") @@ -431,10 +554,35 @@ def test_pytorch_inference_telemetry_cpu( @pytest.mark.parametrize( "ec2_instance_ami", [test_utils.AL2023_BASE_DLAMI_ARM64_US_WEST_2], indirect=True ) -def test_pytorch_inference_telemetry_graviton_cpu( +def test_pytorch_inference_telemetry_framework_graviton_cpu( pytorch_inference_graviton, ec2_connection, cpu_only ): - execute_ec2_inference_test(ec2_connection, pytorch_inference_graviton, PT_TELEMETRY_CMD) + execute_ec2_telemetry_test( + ec2_connection, + pytorch_inference_graviton, + "framework", + "pytorch_inf_telemetry", + test_cmd=PT_TELEMETRY_CMD, + ) + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.integration("telemetry") +@pytest.mark.model("N/A") +@pytest.mark.parametrize("ec2_instance_type", PT_EC2_CPU_ARM64_INSTANCE_TYPE, indirect=True) 
+@pytest.mark.parametrize( + "ec2_instance_ami", [test_utils.AL2023_BASE_DLAMI_ARM64_US_WEST_2], indirect=True +) +def test_pytorch_inference_telemetry_framework_arm64_cpu( + pytorch_inference_arm64, ec2_connection, cpu_only +): + execute_ec2_telemetry_test( + ec2_connection, + pytorch_inference_arm64, + "framework", + "pytorch_inf_telemetry", + test_cmd=PT_TELEMETRY_CMD, + ) @pytest.mark.usefixtures("sagemaker") @@ -444,8 +592,53 @@ def test_pytorch_inference_telemetry_graviton_cpu( @pytest.mark.parametrize( "ec2_instance_ami", [test_utils.AL2023_BASE_DLAMI_ARM64_US_WEST_2], indirect=True ) -def test_pytorch_inference_telemetry_arm64_cpu(pytorch_inference_arm64, ec2_connection, cpu_only): - execute_ec2_inference_test(ec2_connection, pytorch_inference_arm64, PT_TELEMETRY_CMD) +def test_pytorch_inference_telemetry_entrypoint_arm64_cpu( + pytorch_inference_arm64, ec2_connection, cpu_only +): + execute_ec2_telemetry_test( + ec2_connection, + pytorch_inference_arm64, + "entrypoint", + "pytorch_inf_telemetry", + test_cmd=PT_TELEMETRY_CMD, + opt_in=False, + ) + execute_ec2_telemetry_test( + ec2_connection, + pytorch_inference_arm64, + "entrypoint", + "pytorch_inf_telemetry", + test_cmd=PT_TELEMETRY_CMD, + opt_in=True, + ) + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.integration("telemetry") +@pytest.mark.model("N/A") +@pytest.mark.parametrize("ec2_instance_type", PT_EC2_CPU_ARM64_INSTANCE_TYPE, indirect=True) +@pytest.mark.parametrize( + "ec2_instance_ami", [test_utils.AL2023_BASE_DLAMI_ARM64_US_WEST_2], indirect=True +) +def test_pytorch_inference_telemetry_bashrc_arm64_cpu( + pytorch_inference_arm64, ec2_connection, cpu_only +): + execute_ec2_telemetry_test( + ec2_connection, + pytorch_inference_arm64, + "bashrc", + "pytorch_inf_telemetry", + test_cmd=PT_TELEMETRY_CMD, + opt_in=False, + ) + execute_ec2_telemetry_test( + ec2_connection, + pytorch_inference_arm64, + "bashrc", + "pytorch_inf_telemetry", + test_cmd=PT_TELEMETRY_CMD, + opt_in=True, + ) @pytest.mark.usefixtures("sagemaker") @@ -455,10 +648,16 @@ def test_pytorch_inference_telemetry_arm64_cpu(pytorch_inference_arm64, ec2_conn @pytest.mark.parametrize( "ec2_instance_ami", [test_utils.AL2023_BASE_DLAMI_ARM64_US_WEST_2], indirect=True ) -def test_pytorch_inference_telemetry_graviton_gpu( +def test_pytorch_inference_telemetry_framework_graviton_gpu( pytorch_inference_graviton, ec2_connection, gpu_only ): - execute_ec2_inference_test(ec2_connection, pytorch_inference_graviton, PT_TELEMETRY_CMD) + execute_ec2_telemetry_test( + ec2_connection, + pytorch_inference_graviton, + "framework", + "pytorch_inf_telemetry", + test_cmd=PT_TELEMETRY_CMD, + ) @pytest.mark.usefixtures("sagemaker") @@ -468,5 +667,69 @@ def test_pytorch_inference_telemetry_graviton_gpu( @pytest.mark.parametrize( "ec2_instance_ami", [test_utils.AL2023_BASE_DLAMI_ARM64_US_WEST_2], indirect=True ) -def test_pytorch_inference_telemetry_arm64_gpu(pytorch_inference_arm64, ec2_connection, gpu_only): - execute_ec2_inference_test(ec2_connection, pytorch_inference_arm64, PT_TELEMETRY_CMD) +def test_pytorch_inference_telemetry_framework_arm64_gpu( + pytorch_inference_arm64, ec2_connection, gpu_only +): + execute_ec2_telemetry_test( + ec2_connection, + pytorch_inference_arm64, + "framework", + "pytorch_inf_telemetry", + test_cmd=PT_TELEMETRY_CMD, + ) + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.integration("telemetry") +@pytest.mark.model("N/A") +@pytest.mark.parametrize("ec2_instance_type", PT_EC2_GPU_ARM64_INSTANCE_TYPE, indirect=True) 
+@pytest.mark.parametrize( + "ec2_instance_ami", [test_utils.AL2023_BASE_DLAMI_ARM64_US_WEST_2], indirect=True +) +def test_pytorch_inference_telemetry_entrypoint_arm64_gpu( + pytorch_inference_arm64, ec2_connection, gpu_only +): + execute_ec2_telemetry_test( + ec2_connection, + pytorch_inference_arm64, + "entrypoint", + "pytorch_inf_telemetry", + test_cmd=PT_TELEMETRY_CMD, + opt_in=False, + ) + execute_ec2_telemetry_test( + ec2_connection, + pytorch_inference_arm64, + "entrypoint", + "pytorch_inf_telemetry", + test_cmd=PT_TELEMETRY_CMD, + opt_in=True, + ) + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.integration("telemetry") +@pytest.mark.model("N/A") +@pytest.mark.parametrize("ec2_instance_type", PT_EC2_GPU_ARM64_INSTANCE_TYPE, indirect=True) +@pytest.mark.parametrize( + "ec2_instance_ami", [test_utils.AL2023_BASE_DLAMI_ARM64_US_WEST_2], indirect=True +) +def test_pytorch_inference_telemetry_bashrc_arm64_gpu( + pytorch_inference_arm64, ec2_connection, gpu_only +): + execute_ec2_telemetry_test( + ec2_connection, + pytorch_inference_arm64, + "bashrc", + "pytorch_inf_telemetry", + test_cmd=PT_TELEMETRY_CMD, + opt_in=False, + ) + execute_ec2_telemetry_test( + ec2_connection, + pytorch_inference_arm64, + "bashrc", + "pytorch_inf_telemetry", + test_cmd=PT_TELEMETRY_CMD, + opt_in=True, + ) diff --git a/test/dlc_tests/ec2/pytorch/training/common_cases.py b/test/dlc_tests/ec2/pytorch/training/common_cases.py index d45e0b7ca834..8f8cc7cc03e3 100644 --- a/test/dlc_tests/ec2/pytorch/training/common_cases.py +++ b/test/dlc_tests/ec2/pytorch/training/common_cases.py @@ -14,6 +14,7 @@ ) from test.test_utils.ec2 import ( execute_ec2_training_test, + execute_ec2_telemetry_test, get_ec2_instance_type, get_efa_ec2_instance_type, ) @@ -29,9 +30,7 @@ PT_TORCHDATA_DEV_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests", "testTorchdataDev") PT_TORCHDATA_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests", "testTorchdata") PT_DGL_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "dgl_tests", "testPyTorchDGL") -PT_TELEMETRY_CMD = os.path.join( - CONTAINER_TESTS_PREFIX, "pytorch_tests", "test_pt_dlc_telemetry_test" -) +PT_TELEMETRY_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testTelemetry") PT_COMMON_GLOO_MPI_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests", "testPyTorchGlooMpi") PT_COMMON_NCCL_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests", "testPyTorchNccl") PT_AMP_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests", "testPyTorchAMP") @@ -149,16 +148,102 @@ def pytorch_training_dgl(pytorch_training, ec2_connection): ) -def pytorch_telemetry(pytorch_training, ec2_connection): +def pytorch_telemetry_entrypoint_cpu(pytorch_training, ec2_connection): """ Test Telemetry """ - execute_ec2_training_test( + execute_ec2_telemetry_test( + ec2_connection, + pytorch_training, + "entrypoint", + "pytorch_tr_telemetry", + PT_TELEMETRY_CMD, + opt_in=True, + ) + execute_ec2_telemetry_test( + ec2_connection, + pytorch_training, + "entrypoint", + "pytorch_tr_telemetry", + PT_TELEMETRY_CMD, + opt_in=False, + ) + + +def pytorch_telemetry_bashrc_cpu(pytorch_training, ec2_connection): + execute_ec2_telemetry_test( ec2_connection, pytorch_training, + "bashrc", + "pytorch_tr_telemetry", PT_TELEMETRY_CMD, - timeout=900, - container_name="pytorch_telemetry", + opt_in=True, + ) + execute_ec2_telemetry_test( + ec2_connection, + pytorch_training, + "bashrc", + "pytorch_tr_telemetry", + PT_TELEMETRY_CMD, + opt_in=False, + ) + + +def pytorch_telemetry_framework_cpu(pytorch_training, 
ec2_connection): + execute_ec2_telemetry_test( + ec2_connection, + pytorch_training, + "framework", + "pytorch_tr_telemetry", + PT_TELEMETRY_CMD, + ) + + +def pytorch_telemetry_entrypoint_gpu(pytorch_training, ec2_connection): + execute_ec2_telemetry_test( + ec2_connection, + pytorch_training, + "entrypoint", + "pytorch_tr_telemetry", + PT_TELEMETRY_CMD, + opt_in=True, + ) + execute_ec2_telemetry_test( + ec2_connection, + pytorch_training, + "entrypoint", + "pytorch_tr_telemetry", + PT_TELEMETRY_CMD, + opt_in=False, + ) + + +def pytorch_telemetry_bashrc_gpu(pytorch_training, ec2_connection): + execute_ec2_telemetry_test( + ec2_connection, + pytorch_training, + "bashrc", + "pytorch_tr_telemetry", + PT_TELEMETRY_CMD, + opt_in=True, + ) + execute_ec2_telemetry_test( + ec2_connection, + pytorch_training, + "bashrc", + "pytorch_tr_telemetry", + PT_TELEMETRY_CMD, + opt_in=False, + ) + + +def pytorch_telemetry_framework_gpu(pytorch_training, ec2_connection): + execute_ec2_telemetry_test( + ec2_connection, + pytorch_training, + "framework", + "pytorch_tr_telemetry", + test_cmd=PT_TELEMETRY_CMD, ) diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py index 1cd7a8bba47c..944620fa2df3 100644 --- a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py +++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py @@ -36,9 +36,6 @@ PT_AMP_INDUCTOR_CMD = os.path.join( CONTAINER_TESTS_PREFIX, "pytorch_tests", "testPyTorchAMPwithInductor" ) -PT_TELEMETRY_CMD = os.path.join( - CONTAINER_TESTS_PREFIX, "pytorch_tests", "test_pt_dlc_telemetry_test" -) PT_HABANA_TEST_SUITE_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testHabanaPTSuite") PT_TORCHAUDIO_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests", "testTorchaudio") PT_TORCHDATA_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests", "testTorchdata") @@ -681,17 +678,6 @@ def test_pytorch_training_torchdata_cpu( execute_ec2_training_test(ec2_connection, pytorch_training, PT_TORCHDATA_CMD) -@pytest.mark.skip_serialized_release_pt_test -@pytest.mark.usefixtures("feature_aws_framework_present") -@pytest.mark.usefixtures("sagemaker") -@pytest.mark.integration("telemetry") -@pytest.mark.model("N/A") -@pytest.mark.parametrize("ec2_instance_type", PT_EC2_CPU_INSTANCE_TYPE, indirect=True) -@pytest.mark.team("conda") -def test_pytorch_telemetry_cpu(pytorch_training, ec2_connection, cpu_only, pt15_and_above_only): - execute_ec2_training_test(ec2_connection, pytorch_training, PT_TELEMETRY_CMD, timeout=900) - - @pytest.mark.usefixtures("sagemaker") @pytest.mark.model("N/A") @pytest.mark.parametrize("ec2_instance_type", PT_EC2_HPU_INSTANCE_TYPE, indirect=True) diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_4.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_4.py index 4dcae8cf0355..7d1bbc7d6c71 100644 --- a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_4.py +++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_4.py @@ -123,7 +123,7 @@ def test_pytorch_2_4_cpu(pytorch_training___2__4, ec2_connection, cpu_only): (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)), (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)), (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)), - (common_cases.pytorch_telemetry, (pytorch_training, ec2_connection)), + (common_cases.pytorch_telemetry_framework_cpu, (pytorch_training, ec2_connection)), ] if "sagemaker" in 
pytorch_training: diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_5.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_5.py index 1b1f9b088e13..b1e698c5c045 100644 --- a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_5.py +++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_5.py @@ -122,7 +122,7 @@ def test_pytorch_2_5_cpu(pytorch_training___2__5, ec2_connection, cpu_only): (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)), (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)), (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)), - (common_cases.pytorch_telemetry, (pytorch_training, ec2_connection)), + (common_cases.pytorch_telemetry_framework_cpu, (pytorch_training, ec2_connection)), ] if "sagemaker" in pytorch_training: diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_6.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_6.py index 4b6c1f1b4f47..67afa9b44f8e 100644 --- a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_6.py +++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_6.py @@ -35,6 +35,7 @@ def test_pytorch_2_6_gpu( (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)), (common_cases.pytorch_cudnn_match_gpu, (pytorch_training, ec2_connection, region)), (common_cases.pytorch_curand_gpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_telemetry_framework_gpu, (pytorch_training, ec2_connection)), ] if "sagemaker" in pytorch_training: @@ -124,7 +125,7 @@ def test_pytorch_2_6_cpu(pytorch_training___2__6, ec2_connection, cpu_only): (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)), (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)), (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)), - (common_cases.pytorch_telemetry, (pytorch_training, ec2_connection)), + (common_cases.pytorch_telemetry_framework_cpu, (pytorch_training, ec2_connection)), ] if "sagemaker" in pytorch_training: diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_arm64_2_7.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_arm64_2_7.py index 79a209da626d..a1740105e474 100644 --- a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_arm64_2_7.py +++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_arm64_2_7.py @@ -32,7 +32,7 @@ def test_pytorch_2_7_gpu( (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)), (common_cases.pytorch_cudnn_match_gpu, (pytorch_training, ec2_connection, region)), (common_cases.pytorch_curand_gpu, (pytorch_training, ec2_connection)), - (common_cases.pytorch_telemetry, (pytorch_training, ec2_connection)), + (common_cases.pytorch_telemetry_framework_gpu, (pytorch_training, ec2_connection)), ] if "sagemaker" in pytorch_training: diff --git a/test/dlc_tests/ec2/tensorflow/inference/test_tensorflow_inference.py b/test/dlc_tests/ec2/tensorflow/inference/test_tensorflow_inference.py index e3ffa3f9691b..8d7386311694 100644 --- a/test/dlc_tests/ec2/tensorflow/inference/test_tensorflow_inference.py +++ b/test/dlc_tests/ec2/tensorflow/inference/test_tensorflow_inference.py @@ -9,8 +9,13 @@ import test.test_utils.ec2 as ec2_utils from test import test_utils -from test.test_utils.ec2 import get_ec2_instance_type, get_ec2_accelerator_type +from test.test_utils.ec2 import ( + get_ec2_instance_type, + get_ec2_accelerator_type, + execute_ec2_telemetry_test, +) from 
test.dlc_tests.conftest import LOGGER +from test.test_utils import CONTAINER_TESTS_PREFIX TENSORFLOW1_VERSION = "1." TENSORFLOW2_VERSION = "2." @@ -21,6 +26,7 @@ TF_EC2_EIA_ACCELERATOR_TYPE = get_ec2_accelerator_type(default="eia1.large", processor="eia") TF_EC2_NEURON_ACCELERATOR_TYPE = get_ec2_instance_type(default="inf1.xlarge", processor="neuron") TF_EC2_NEURONX_ACCELERATOR_TYPE = get_ec2_instance_type(default="trn1.2xlarge", processor="neuronx") +TF_TELEMETRY_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testTelemetry") TF_EC2_NEURONX_INF2_ACCELERATOR_TYPE = get_ec2_instance_type( default="inf2.xlarge", processor="neuronx" ) @@ -244,7 +250,7 @@ def test_ec2_tensorflow_inference_eia_gpu( @pytest.mark.model("mnist") @pytest.mark.team("frameworks") @pytest.mark.parametrize("ec2_instance_type", TF_EC2_SINGLE_GPU_INSTANCE_TYPE, indirect=True) -def test_ec2_tensorflow_inference_gpu_telemetry( +def test_ec2_tensorflow_inference_telemetry_framework_gpu( tensorflow_inference, ec2_connection, region, gpu_only, ec2_instance_type ): if test_utils.is_image_incompatible_with_instance_type(tensorflow_inference, ec2_instance_type): @@ -254,16 +260,70 @@ def test_ec2_tensorflow_inference_gpu_telemetry( run_ec2_tensorflow_inference(tensorflow_inference, ec2_connection, "8500", region, True) +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.model("mnist") +@pytest.mark.team("frameworks") +@pytest.mark.parametrize("ec2_instance_type", TF_EC2_SINGLE_GPU_INSTANCE_TYPE, indirect=True) +def test_ec2_tensorflow_inference_telemetry_bashrc_gpu( + tensorflow_inference, ec2_connection, region, gpu_only, ec2_instance_type +): + if test_utils.is_image_incompatible_with_instance_type(tensorflow_inference, ec2_instance_type): + pytest.skip( + f"Image {tensorflow_inference} is incompatible with instance type {ec2_instance_type}" + ) + execute_ec2_telemetry_test( + ec2_connection, + tensorflow_inference, + "bashrc", + "tensorflow_inf_telemetry", + TF_TELEMETRY_CMD, + opt_in=True, + ) + execute_ec2_telemetry_test( + ec2_connection, + tensorflow_inference, + "bashrc", + "tensorflow_inf_telemetry", + TF_TELEMETRY_CMD, + opt_in=False, + ) + + @pytest.mark.usefixtures("sagemaker") @pytest.mark.model("mnist") @pytest.mark.team("frameworks") @pytest.mark.parametrize("ec2_instance_type", TF_EC2_CPU_INSTANCE_TYPE, indirect=True) -def test_ec2_tensorflow_inference_cpu_telemetry( +def test_ec2_tensorflow_inference_telemetry_framework_cpu( tensorflow_inference, ec2_connection, region, cpu_only ): run_ec2_tensorflow_inference(tensorflow_inference, ec2_connection, "8500", region, True) +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.model("mnist") +@pytest.mark.team("frameworks") +@pytest.mark.parametrize("ec2_instance_type", TF_EC2_CPU_INSTANCE_TYPE, indirect=True) +def test_ec2_tensorflow_inference_telemetry_bashrc_cpu( + tensorflow_inference, ec2_connection, region, cpu_only +): + execute_ec2_telemetry_test( + ec2_connection, + tensorflow_inference, + "bashrc", + "tensorflow_inf_telemetry", + TF_TELEMETRY_CMD, + opt_in=True, + ) + execute_ec2_telemetry_test( + ec2_connection, + tensorflow_inference, + "bashrc", + "tensorflow_inf_telemetry", + TF_TELEMETRY_CMD, + opt_in=False, + ) + + @pytest.mark.model("mnist") @pytest.mark.parametrize("ec2_instance_type", TF_EC2_GRAVITON_INSTANCE_TYPE, indirect=True) @pytest.mark.parametrize( @@ -291,7 +351,7 @@ def test_ec2_tensorflow_inference_arm64_cpu( @pytest.mark.parametrize( "ec2_instance_ami", [test_utils.AL2023_BASE_DLAMI_ARM64_US_WEST_2], indirect=True ) -def 
test_ec2_tensorflow_inference_graviton_cpu_telemetry( +def test_ec2_tensorflow_inference_graviton_telemetry_framework_cpu( tensorflow_inference_graviton, ec2_connection, region, cpu_only ): run_ec2_tensorflow_inference( @@ -304,12 +364,38 @@ def test_ec2_tensorflow_inference_graviton_cpu_telemetry( @pytest.mark.parametrize( "ec2_instance_ami", [test_utils.AL2023_BASE_DLAMI_ARM64_US_WEST_2], indirect=True ) -def test_ec2_tensorflow_inference_arm64_cpu_telemetry( +def test_ec2_tensorflow_inference_arm64_telemetry_framework_cpu( tensorflow_inference_arm64, ec2_connection, region, cpu_only ): run_ec2_tensorflow_inference(tensorflow_inference_arm64, ec2_connection, "8500", region, True) +@pytest.mark.model("mnist") +@pytest.mark.parametrize("ec2_instance_type", TF_EC2_ARM64_INSTANCE_TYPE, indirect=True) +@pytest.mark.parametrize( + "ec2_instance_ami", [test_utils.AL2023_BASE_DLAMI_ARM64_US_WEST_2], indirect=True +) +def test_ec2_tensorflow_inference_arm64_telemetry_bashrc_cpu( + tensorflow_inference_arm64, ec2_connection, region, cpu_only +): + execute_ec2_telemetry_test( + ec2_connection, + tensorflow_inference_arm64, + "bashrc", + "tensorflow_inf_telemetry", + TF_TELEMETRY_CMD, + opt_in=True, + ) + execute_ec2_telemetry_test( + ec2_connection, + tensorflow_inference_arm64, + "bashrc", + "tensorflow_inf_telemetry", + TF_TELEMETRY_CMD, + opt_in=False, + ) + + def run_ec2_tensorflow_inference( image_uri, ec2_connection, grpc_port, region, telemetry_mode=False ): diff --git a/test/dlc_tests/ec2/tensorflow/training/test_tensorflow_training.py b/test/dlc_tests/ec2/tensorflow/training/test_tensorflow_training.py index 90d8830377b6..ae4dd5740f30 100644 --- a/test/dlc_tests/ec2/tensorflow/training/test_tensorflow_training.py +++ b/test/dlc_tests/ec2/tensorflow/training/test_tensorflow_training.py @@ -11,7 +11,11 @@ LOGGER, is_tf_version, ) -from test.test_utils.ec2 import execute_ec2_training_test, get_ec2_instance_type +from test.test_utils.ec2 import ( + execute_ec2_training_test, + get_ec2_instance_type, + execute_ec2_telemetry_test, +) TF1_STANDALONE_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testTensorflow1Standalone") @@ -20,7 +24,7 @@ TF1_HVD_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testTF1HVD") TF2_HVD_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testTF2HVD") TF_OPENCV_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testOpenCV") -TF_TELEMETRY_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "test_tf_dlc_telemetry_test") +TF_TELEMETRY_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testTelemetry") TF_KERAS_HVD_CMD_AMP = os.path.join(CONTAINER_TESTS_PREFIX, "testTFKerasHVDAMP") TF_KERAS_HVD_CMD_FP32 = os.path.join(CONTAINER_TESTS_PREFIX, "testTFKerasHVDFP32") TF_TENSORBOARD_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testTensorBoard") @@ -196,30 +200,154 @@ def test_tensorflow_opencv_cpu(tensorflow_training, ec2_connection, tf2_only, cp execute_ec2_training_test(ec2_connection, tensorflow_training, TF_OPENCV_CMD) -# Testing Telemetry Script on only one GPU instance @pytest.mark.usefixtures("sagemaker") @pytest.mark.flaky(reruns=3) @pytest.mark.integration("telemetry") @pytest.mark.model("N/A") @pytest.mark.team("frameworks") @pytest.mark.parametrize("ec2_instance_type", TF_EC2_SINGLE_GPU_INSTANCE_TYPE, indirect=True) -def test_tensorflow_telemetry_gpu(tensorflow_training, ec2_connection, gpu_only, ec2_instance_type): +def test_tensorflow_telemetry_entrypoint_gpu( + tensorflow_training, ec2_connection, gpu_only, ec2_instance_type +): + if 
test_utils.is_image_incompatible_with_instance_type(tensorflow_training, ec2_instance_type): + pytest.skip( + f"Image {tensorflow_training} is incompatible with instance type {ec2_instance_type}" + ) + execute_ec2_telemetry_test( + ec2_connection, + tensorflow_training, + "entrypoint", + "tensorflow_tr_telemetry", + TF_TELEMETRY_CMD, + opt_in=True, + ) + execute_ec2_telemetry_test( + ec2_connection, + tensorflow_training, + "entrypoint", + "tensorflow_tr_telemetry", + TF_TELEMETRY_CMD, + opt_in=False, + ) + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.flaky(reruns=3) +@pytest.mark.integration("telemetry") +@pytest.mark.model("N/A") +@pytest.mark.team("frameworks") +@pytest.mark.parametrize("ec2_instance_type", TF_EC2_SINGLE_GPU_INSTANCE_TYPE, indirect=True) +def test_tensorflow_telemetry_bashrc_gpu( + tensorflow_training, ec2_connection, gpu_only, ec2_instance_type +): if test_utils.is_image_incompatible_with_instance_type(tensorflow_training, ec2_instance_type): pytest.skip( f"Image {tensorflow_training} is incompatible with instance type {ec2_instance_type}" ) - execute_ec2_training_test(ec2_connection, tensorflow_training, TF_TELEMETRY_CMD) + execute_ec2_telemetry_test( + ec2_connection, + tensorflow_training, + "bashrc", + "tensorflow_tr_telemetry", + TF_TELEMETRY_CMD, + opt_in=True, + ) + execute_ec2_telemetry_test( + ec2_connection, + tensorflow_training, + "bashrc", + "tensorflow_tr_telemetry", + TF_TELEMETRY_CMD, + opt_in=False, + ) + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.flaky(reruns=3) +@pytest.mark.integration("telemetry") +@pytest.mark.model("N/A") +@pytest.mark.team("frameworks") +@pytest.mark.parametrize("ec2_instance_type", TF_EC2_SINGLE_GPU_INSTANCE_TYPE, indirect=True) +def test_tensorflow_telemetry_framework_gpu( + tensorflow_training, ec2_connection, gpu_only, ec2_instance_type +): + if test_utils.is_image_incompatible_with_instance_type(tensorflow_training, ec2_instance_type): + pytest.skip( + f"Image {tensorflow_training} is incompatible with instance type {ec2_instance_type}" + ) + execute_ec2_telemetry_test( + ec2_connection, + tensorflow_training, + "framework", + "tensorflow_tr_telemetry", + TF_TELEMETRY_CMD, + ) -# Testing Telemetry Script on only one CPU instance @pytest.mark.usefixtures("sagemaker") @pytest.mark.flaky(reruns=3) @pytest.mark.integration("telemetry") @pytest.mark.model("N/A") @pytest.mark.team("frameworks") @pytest.mark.parametrize("ec2_instance_type", TF_EC2_CPU_INSTANCE_TYPE, indirect=True) -def test_tensorflow_telemetry_cpu(tensorflow_training, ec2_connection, cpu_only): - execute_ec2_training_test(ec2_connection, tensorflow_training, TF_TELEMETRY_CMD) +def test_tensorflow_telemetry_entrypoint_cpu(tensorflow_training, ec2_connection, cpu_only): + execute_ec2_telemetry_test( + ec2_connection, + tensorflow_training, + "entrypoint", + "tensorflow_tr_telemetry", + TF_TELEMETRY_CMD, + opt_in=True, + ) + execute_ec2_telemetry_test( + ec2_connection, + tensorflow_training, + "entrypoint", + "tensorflow_tr_telemetry", + TF_TELEMETRY_CMD, + opt_in=False, + ) + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.flaky(reruns=3) +@pytest.mark.integration("telemetry") +@pytest.mark.model("N/A") +@pytest.mark.team("frameworks") +@pytest.mark.parametrize("ec2_instance_type", TF_EC2_CPU_INSTANCE_TYPE, indirect=True) +def test_tensorflow_telemetry_bashrc_cpu(tensorflow_training, ec2_connection, cpu_only): + execute_ec2_telemetry_test( + ec2_connection, + tensorflow_training, + "bashrc", + "tensorflow_tr_telemetry", + 
TF_TELEMETRY_CMD, + opt_in=True, + ) + execute_ec2_telemetry_test( + ec2_connection, + tensorflow_training, + "bashrc", + "tensorflow_tr_telemetry", + TF_TELEMETRY_CMD, + opt_in=False, + ) + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.flaky(reruns=3) +@pytest.mark.integration("telemetry") +@pytest.mark.model("N/A") +@pytest.mark.team("frameworks") +@pytest.mark.parametrize("ec2_instance_type", TF_EC2_CPU_INSTANCE_TYPE, indirect=True) +def test_tensorflow_telemetry_framework_cpu(tensorflow_training, ec2_connection, cpu_only): + execute_ec2_telemetry_test( + ec2_connection, + tensorflow_training, + "framework", + "tensorflow_tr_telemetry", + TF_TELEMETRY_CMD, + ) # Skip test for TF 2.0 and below: https://github.com/tensorflow/tensorflow/issues/33484#issuecomment-555299647 diff --git a/test/test_utils/ec2.py b/test/test_utils/ec2.py index 2968b3204d76..5f3a825e74be 100644 --- a/test/test_utils/ec2.py +++ b/test/test_utils/ec2.py @@ -1299,6 +1299,103 @@ def execute_ec2_training_test( return ec2_res +def execute_ec2_telemetry_test( + connection, + ecr_uri, + call_type, + container_name, + test_cmd, + opt_in=False, + region=DEFAULT_REGION, + timeout=900, +): + """ + Execute telemetry tests on EC2 instances using Docker containers. + + Args: + connection: EC2 connection object + ecr_uri (str): ECR image URI + call_type (str): Type of test to run ('bashrc', 'entrypoint', 'framework') + container_name (str): Base name for the container + test_cmd (str): Test command to execute + opt_in (bool): Whether to run in opt-in mode (default: False) + region (str): AWS region + timeout (int): Timeout in seconds (default: 900) + + Returns: + Result object from the connection.run command + + Raises: + RuntimeError: If invalid call_type is provided + """ + # Validate call type + VALID_CALL_TYPES = {"bashrc", "entrypoint", "framework"} + if call_type not in VALID_CALL_TYPES: + raise RuntimeError(f"Invalid call_type. 
Must be one of: {', '.join(VALID_CALL_TYPES)}") + + # Set up Docker runtime configuration + docker_runtime = "--runtime=nvidia --gpus all" if "gpu" in ecr_uri else "" + if "pytorch" in ecr_uri: + framework_env = "-e FRAMEWORK='torch'" + elif "tensorflow" in ecr_uri: + framework_env = "-e FRAMEWORK='tensorflow'" + else: + framework_env = "" + opt_out_env = "" if opt_in else "-e OPT_OUT_TRACKING='true'" + + # Set up container and mount configuration + test_suffix = "opt_in" if opt_in else "opt_out" + container_name = ( + f"{container_name}_{call_type}_{test_suffix}" + if call_type in {"bashrc", "entrypoint"} + else f"{container_name}_{call_type}" + ) + + container_test_local_dir = os.path.join("$HOME", "container_tests") + mount_path = f"-v {container_test_local_dir}:{os.path.join(os.sep, 'test')}" + + # Prepare test command + test_cmd = f"{test_cmd} {call_type} {test_suffix}" + LOGGER.info(f"Executing test: {test_cmd}") + + # for the entrypoint test, avoid invoking bashrc telemetry + nobashrc_cmd = "bash --norc" if call_type == "entrypoint" else "" + + # for other tests, avoid triggering entrypoint telemetry + entrypoint_override = "--entrypoint /bin/bash" if call_type != "entrypoint" else "" + + try: + # Login to ECR and pull image + account_id = get_account_id_from_image_uri(ecr_uri) + login_to_ecr_registry(connection, account_id, region) + + LOGGER.info(f"Pulling image: {ecr_uri}") + connection.run(f"docker pull {ecr_uri}", hide="out") + + # Execute test based on call type + # Start container + connection.run( + f"docker run {docker_runtime} --name {container_name} " + f" {mount_path} " + f"-itd -e TEST_MODE='1' {framework_env} {opt_out_env} {entrypoint_override} {ecr_uri} {nobashrc_cmd}", + hide=True, + ) + + # Execute test command + ec2_res = connection.run( + f"docker exec --user root {container_name} bash -c '{test_cmd}'", + hide=True, + timeout=timeout, + ) + + LOGGER.info(f"Test completed for {call_type} on {ecr_uri}") + return ec2_res + + except Exception as e: + LOGGER.error(f"Test failed: {str(e)}") + raise + + def execute_ec2_inference_test(connection, ecr_uri, test_cmd, region=DEFAULT_REGION): docker_runtime = "--runtime=nvidia --gpus all" if "gpu" in ecr_uri else "" container_test_local_dir = os.path.join("$HOME", "container_tests")
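
For reference, the src/utils.py hunk above adds a second replace pass so that both ${VARIABLE} and {VARIABLE} placeholders are filled when the new bash_telemetry.sh template is rendered. The sketch below is a minimal, illustrative reconstruction of that substitution, not the repository's actual generate_dlc_cmd code; the replacement keys are inferred from the placeholders used in bash_telemetry.sh, and the example values are made up.

    # Minimal sketch of the ${VAR} / {VAR} substitution performed when rendering
    # telemetry templates (illustrative only; not the real utils.generate_dlc_cmd).
    def render_telemetry_template(template_text, framework, framework_version, container_type):
        replacements = {
            "FRAMEWORK": framework,
            "FRAMEWORK_VERSION": framework_version,
            "CONTAINER_TYPE": container_type,
        }
        content = template_text
        for anchor, value in replacements.items():
            # bash_telemetry.sh uses shell-style ${FRAMEWORK} placeholders
            content = content.replace(f"${{{anchor}}}", value)
            # dlc_template.py (sitecustomize) uses {FRAMEWORK}-style placeholders
            content = content.replace(f"{{{anchor}}}", value)
        return content

    example = 'python deep_learning_container.py --framework "${FRAMEWORK}" --framework-version "${FRAMEWORK_VERSION}" --container-type "${CONTAINER_TYPE}"'
    print(render_telemetry_template(example, "pytorch", "2.6.0", "training"))
    # -> python deep_learning_container.py --framework "pytorch" --framework-version "2.6.0" --container-type "training"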