Skip to content

Add bash file and modify docker #4773

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 93 commits into from
May 22, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
93 commits
Select commit Hold shift + click to select a range
d37aa99
add bashfile and modify docker
May 1, 2025
6f359b0
add cpu index
May 1, 2025
165be7b
fix cpu
May 2, 2025
ebbabef
increase size
May 2, 2025
095e988
rebuild and fix script
May 2, 2025
e2a0ba3
reformat
May 2, 2025
02f3932
modify test
May 2, 2025
b2dd1aa
Merge branch 'master' into bash
May 2, 2025
22933a4
fix test
May 2, 2025
c8084c9
modify part of test
May 8, 2025
24d1f18
revert some changes
May 8, 2025
7c62f90
modify telemetry test
May 9, 2025
102cb3d
refactor test to shell script
May 9, 2025
707843e
Merge branch 'master' into bash
Yadan-Wei May 9, 2025
eed9365
build sm image
May 9, 2025
7a87bca
rebuild and test
May 12, 2025
885b999
build inference ec2 image
May 12, 2025
9bc2073
build tf training image
May 13, 2025
63a9ba0
fix directory
May 13, 2025
57218d9
rebuild tf tr
May 13, 2025
22ecb86
build pt2.6 ec2 images
May 13, 2025
287d28d
format
May 13, 2025
e16509b
Merge branch 'master' into bash
Yadan-Wei May 13, 2025
247d5fe
fix some error
May 13, 2025
8ad2fb5
build cpu without sitecustomize
May 13, 2025
9a4f84c
build TF218 without sitecustomize
May 13, 2025
282484e
modify tf tr test
May 14, 2025
19145dd
fix entrypoint test
May 14, 2025
36da104
rebuild pt2.6 ec2 without sitecustomize
May 14, 2025
b151274
fix pytorch test failure
May 14, 2025
c29fcec
fix cpu entrypoint
May 14, 2025
003e700
fix tf218 cpu entrypoint
May 14, 2025
2293704
rebuild with updated tf218 entrypoint
May 14, 2025
d2fb9f6
rebuild with updated pt2.6 entrypoint
May 14, 2025
e9335cc
test PT2.6 inf ec2
May 14, 2025
8ea000b
run TF 2.18 inference
May 15, 2025
2a99697
run TF 2.18 inference
May 15, 2025
937c056
run TF 2.18 inference fix entrypoint
May 15, 2025
89c62c8
build tf inf sm with new entrypoint
May 15, 2025
26047c2
build tf tr sm
May 15, 2025
b9ebd2a
build pt tr ec2
May 15, 2025
dffaf73
build pt tr sm
May 15, 2025
4b00b97
rebuild tf218 tr sm
May 15, 2025
8f35cd4
rebuild tf218 inf sm
May 15, 2025
27497b3
disable tf inf sm 218 autopatch
May 15, 2025
8a9e954
rebuild tf2.18 tr sm with updated entrypoint
May 15, 2025
0fc84a3
build pt2.6 tr ec2
May 15, 2025
9b3a228
build pt2.6 tr ec2
May 15, 2025
438ef6f
rebuild tf218 inf sm
May 15, 2025
59547b9
build PT2.6 Inf ec2
May 15, 2025
e53649b
build PT2.6 Inf sm
May 15, 2025
28023f4
test skip logic
May 16, 2025
871d87a
fix typo
May 16, 2025
27c4489
modify test and run for PT2.6 inf sm
May 16, 2025
9e9c6c6
modify test and run for PT2.6 inf sm remove version skip
May 16, 2025
f913ac9
fix test
May 19, 2025
6226baf
test PT inf ec2'
May 19, 2025
772cb40
test pt tr ec2
May 19, 2025
8cd0e65
test pt tr sm
May 19, 2025
2796e98
build tf tr 218 sm and test
May 19, 2025
2f51dba
revert temp change
May 19, 2025
d3088af
revert buildspec
May 19, 2025
b4796df
test PT2.6 tr ec2 autopatch
May 19, 2025
3c5e3aa
test PT2.5 tr ec2 prod
May 19, 2025
378838c
Revert "test PT2.5 tr ec2 prod"
May 19, 2025
2d1c8af
test PT2.5 tr ec2 prod
May 19, 2025
0535e84
disable autopatch
May 19, 2025
6d2b12e
test PT 2.4 training prod
May 19, 2025
6061fae
test pt2.6 tr autopatch
May 19, 2025
669ba11
Merge branch 'master' into bash
Yadan-Wei May 19, 2025
9edcc0a
test arm64 training prod
May 19, 2025
dfe2836
test arm64 PT2.6
May 19, 2025
2f385a1
Merge branch 'master' into bash
Yadan-Wei May 19, 2025
992ebb4
build arm64 inf
May 19, 2025
1c3b37f
Merge branch 'master' into bash
Yadan-Wei May 19, 2025
849c315
test PT2.6 training autopatch image
May 19, 2025
56d37e8
set arm64 mode to false
May 19, 2025
fc7c663
fix confest
May 19, 2025
8d58d15
do not build
May 19, 2025
3c0ce32
do not build arm64 inf
May 19, 2025
bf701c1
do not build arm64 inf
May 19, 2025
a487a16
test PT2.5 Training Prod
May 19, 2025
bde0aa0
test TF 218 tr ec2
May 20, 2025
449206f
test TF218 inf
May 20, 2025
d139d8e
add skip tensorflow inference logic
May 20, 2025
005bb3f
test TF218 inf arm64
May 20, 2025
e1801ee
fix typo and revert some files
May 20, 2025
e48870d
revert toml
May 20, 2025
3164d8e
add base
May 20, 2025
3bdbde7
revert buildspec
May 20, 2025
dbacf9e
address comments
May 21, 2025
0b6b7ae
remove sitecustomize test
May 21, 2025
2833e85
Merge branch 'master' into bash
Yadan-Wei May 22, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions miscellaneous_scripts/bash_telemetry.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/bin/bash
# bash_telemetry.sh
#
# Best-effort launcher for the deep learning container telemetry script.
#
# NOTE(review): the FRAMEWORK, FRAMEWORK_VERSION and CONTAINER_TYPE references
# below appear to be template anchors that the image build substitutes with
# literal values before this file is copied into the image -- confirm against
# the build tooling before renaming them.
#
# Telemetry is launched only when the telemetry script is present and the
# OPT_OUT_TRACKING environment variable is unset, empty, or anything other than
# "true" (compared case-insensitively via the ",," lower-casing expansion).
if [ -f /usr/local/bin/deep_learning_container.py ] && [[ -z "${OPT_OUT_TRACKING}" || "${OPT_OUT_TRACKING,,}" != "true" ]]; then
    # Run in a backgrounded subshell with all output discarded so the caller is
    # neither blocked by nor exposed to anything the telemetry script prints.
    (
        python /usr/local/bin/deep_learning_container.py \
            --framework "${FRAMEWORK}" \
            --framework-version "${FRAMEWORK_VERSION}" \
            --container-type "${CONTAINER_TYPE}" \
            &>/dev/null &
    )
fi

Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,16 @@
import sys


# Fire the telemetry helper on a best-effort basis: its output is discarded and
# its exit status is ignored, so a missing or failing script never blocks the
# entrypoint from running the user's command.
try:
    subprocess.call(
        ["/bin/bash", "/usr/local/bin/bash_telemetry.sh"],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
except Exception:
    # Deliberately swallowed (e.g. OSError when /bin/bash is unavailable).
    # `Exception` rather than a bare `except` so that KeyboardInterrupt and
    # SystemExit still propagate and the container can be stopped promptly.
    pass


subprocess.check_call(shlex.split(" ".join(sys.argv[1:])))

# prevent docker exit
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,15 @@
import signal
import torch

# Fire the telemetry helper on a best-effort basis: its output is discarded and
# its exit status is ignored, so a missing or failing script never blocks the
# rest of the entrypoint.
try:
    subprocess.call(
        ["/bin/bash", "/usr/local/bin/bash_telemetry.sh"],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
except Exception:
    # Deliberately swallowed (e.g. OSError when /bin/bash is unavailable).
    # `Exception` rather than a bare `except` so that KeyboardInterrupt and
    # SystemExit still propagate and the container can be stopped promptly.
    pass

if torch.cuda.is_available():
# run compat mounting by default
try:
Expand Down
4 changes: 2 additions & 2 deletions pytorch/training/buildspec-2-5-ec2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ images:
os_version: &OS_VERSION ubuntu22.04
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
# build_tag_override: "beta:2.5.1-cpu-py311-ubuntu22.04-ec2"
# build_tag_override: "true"
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
target: ec2
context:
Expand All @@ -58,7 +58,7 @@ images:
os_version: &OS_VERSION ubuntu22.04
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
# build_tag_override: "beta:2.5.1-gpu-py311-cu121-ubuntu22.04-ec2"
# build_tag_override: "true"
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
*DEVICE_TYPE ]
target: ec2
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
#!/usr/bin/env bash
# Run the telemetry script on a best-effort basis: its output is discarded and
# any failure (including a missing script) is ignored so startup continues.
bash /usr/local/bin/bash_telemetry.sh >/dev/null 2>&1 || true

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Better to log error on fail, something like:

bash /usr/local/bin/bash_telemetry.sh >>/var/log/telemetry.log 2>&1 || echo "Telemetry script failed" >&2


CUDA_AVAILABLE=$(python -c "import torch; print(torch.cuda.is_available())")
if [ "$CUDA_AVAILABLE" = "True" ]; then
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
#!/usr/bin/env bash

# telemetry.sh
# Execute telemetry script if it exists, suppress errors
bash /usr/local/bin/bash_telemetry.sh >/dev/null 2>&1 || true

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same here



CUDA_AVAILABLE=$(python -c "import torch; print(torch.cuda.is_available())")
if [ "$CUDA_AVAILABLE" = "True" ]; then
bash /usr/local/bin/start_cuda_compat.sh
Expand Down
24 changes: 20 additions & 4 deletions src/image_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,26 +255,42 @@ def image_builder(buildspec, image_types=[], device_types=[]):
f"This is required to set job_type label."
)

template_file = os.path.join(
sitecustomize_template_file = os.path.join(
os.sep, get_cloned_folder_path(), "miscellaneous_scripts", "dlc_template.py"
)
bash_template_file = os.path.join(
os.sep, get_cloned_folder_path(), "miscellaneous_scripts", "bash_telemetry.sh"
)

template_fw_version = (
str(image_config["framework_version"])
if image_config.get("framework_version")
else str(BUILDSPEC["version"])
)
template_fw = str(BUILDSPEC["framework"])
post_template_file = utils.generate_dlc_cmd(
template_path=template_file,
sitecustomize_post_template_file = utils.generate_dlc_cmd(
template_path=sitecustomize_template_file,
output_path=os.path.join(image_config["root"], "out.py"),
framework=template_fw,
framework_version=template_fw_version,
container_type=label_job_type,
)
bash_post_template_file = utils.generate_dlc_cmd(
template_path=bash_template_file,
output_path=os.path.join(image_config["root"], "telemetry.sh"),
framework=template_fw,
framework_version=template_fw_version,
container_type=label_job_type,
)

ARTIFACTS.update(
{"customize": {"source": post_template_file, "target": "sitecustomize.py"}}
{
"customize": {
"source": sitecustomize_post_template_file,
"target": "sitecustomize.py",
},
"bash": {"source": bash_post_template_file, "target": "bash_telemetry.sh"},
}
)

context = Context(ARTIFACTS, f"build/{image_name}.tar.gz", image_config["root"])
Expand Down
3 changes: 2 additions & 1 deletion src/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -688,7 +688,8 @@ def generate_dlc_cmd(template_path, output_path, framework, framework_version, c
}

for anchor, value in replacements.items():
content = content.replace(f"{{{anchor}}}", value)
content = content.replace(f"${{{anchor}}}", value) # replace ${VARIABLE} with value
content = content.replace(f"{{{anchor}}}", value) # replace {VARIABLE} with value

with open(output_path, "w") as out_f:
out_f.write(content)
Expand Down
4 changes: 2 additions & 2 deletions tensorflow/inference/buildspec-2-18-ec2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ images:
"-ec2" ]
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
target: ec2
# build_tag_override: "beta:2.16.1-cpu-py310-ubuntu20.04-ec2"
# build_tag_override: "true"
context:
<<: *INFERENCE_CONTEXT
BuildEC2TensorflowGPUInferencePy3DockerImage:
Expand All @@ -67,6 +67,6 @@ images:
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
*DEVICE_TYPE ]
target: ec2
# build_tag_override: "beta:2.16.1-gpu-py310-cu122-ubuntu20.04-ec2"
# build_tag_override: "true"
context:
<<: *INFERENCE_CONTEXT
2 changes: 1 addition & 1 deletion tensorflow/inference/buildspec-arm64-2-18-ec2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,6 @@ images:
latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile.arm64., *DEVICE_TYPE ]
target: ec2
# build_tag_override: "beta:2.16.1-cpu-py310-ubuntu20.04-ec2"
# build_tag_override: "true"
context:
<<: *INFERENCE_CONTEXT
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#!/usr/bin/env bash

# Execute telemetry script if it exists, suppress errors
bash /usr/local/bin/bash_telemetry.sh >/dev/null 2>&1 || true

TF_SERVING_PACKAGE=$(pip list | grep tensorflow-serving | cut -d ' ' -f 1)

Expand Down
3 changes: 3 additions & 0 deletions tensorflow/training/buildspec-2-18-ec2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ context:
dockerd-entrypoint:
source: docker/build_artifacts/dockerd-entrypoint.py
target: dockerd-entrypoint.py
dockerd_ec2_entrypoint:
source: docker/build_artifacts/dockerd_ec2_entrypoint.sh
target: dockerd_ec2_entrypoint.sh
deep_learning_container:
source: ../../src/deep_learning_container.py
target: deep_learning_container.py
Expand Down
3 changes: 3 additions & 0 deletions tensorflow/training/buildspec-2-18-sm.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ context:
dockerd-entrypoint:
source: docker/build_artifacts/dockerd-entrypoint.py
target: dockerd-entrypoint.py
dockerd_ec2_entrypoint:
source: docker/build_artifacts/dockerd_ec2_entrypoint.sh
target: dockerd_ec2_entrypoint.sh
deep_learning_container:
source: ../../src/deep_learning_container.py
target: deep_learning_container.py
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,21 @@
import sys
import tensorflow as tf

# Fire the telemetry helper on a best-effort basis: its output is discarded and
# its exit status is ignored, so a missing or failing script never blocks the
# rest of the entrypoint.
try:
    subprocess.call(
        ["/bin/bash", "/usr/local/bin/bash_telemetry.sh"],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
except Exception:
    # Deliberately swallowed (e.g. OSError when /bin/bash is unavailable).
    # `Exception` rather than a bare `except` so that KeyboardInterrupt and
    # SystemExit still propagate and the container can be stopped promptly.
    pass


if tf.test.is_built_with_cuda():
# run compat mounting by default
try:
subprocess.run(["bash", "-m", "/usr/local/bin/start_cuda_compat.sh"])
except Exception as e:
print(f"Error running script: {e}")

if not os.path.exists("/opt/ml/input/config"):
subprocess.call(["python", "/usr/local/bin/deep_learning_container.py", "&>/dev/null", "&"])

subprocess.check_call(shlex.split(" ".join(sys.argv[1:])))
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/usr/bin/env bash

# Container entrypoint: run telemetry on a best-effort basis, then run the
# command supplied as arguments.
#
# No existence check is made for the telemetry script; instead all of its
# output is discarded and `|| true` ignores any failure (including a missing
# script), so telemetry can never prevent the requested command from running.
bash /usr/local/bin/bash_telemetry.sh >/dev/null 2>&1 || true

# eval joins the arguments with spaces and re-parses the result as a shell
# command line before executing it.
eval "$@"
57 changes: 57 additions & 0 deletions test/dlc_tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,13 @@
"feature_s3_plugin_present": {NightlyFeatureLabel.AWS_S3_PLUGIN_INSTALLED.value},
}

# Skip telemetry tests for specific versions.
# Layout: telemetry test type -> framework name -> list of framework version
# strings. A test of the given type is skipped when the image's framework
# version is found (exact string match) in the corresponding list.
# NOTE(review): the "framework" entries hold only an empty string, which
# presumably never matches a real version (i.e. nothing is skipped) -- confirm.
TELEMETRY_SKIP_VERSIONS = {
"entrypoint": {"pytorch": ["2.4.0", "2.5.1", "2.6.0"], "tensorflow": ["2.18.0"]},
"bashrc": {"pytorch": ["2.4.0", "2.5.1", "2.6.0"], "tensorflow": ["2.18.0"]},
"framework": {"pytorch": [""], "tensorflow": [""]},
}


# Nightly fixtures
@pytest.fixture(scope="session")
Expand Down Expand Up @@ -1004,6 +1011,56 @@ def skip_serialized_release_pt_test(request):
)


@pytest.fixture(autouse=True)
def skip_telemetry_tests(request):
    """Apply the telemetry skip rules to the current test, based on its name.

    The lower-cased test name is checked for ``telemetry_entrypoint``,
    ``telemetry_bashrc`` and ``telemetry_framework`` in that order; the first
    marker found selects the test type handed to ``_check_telemetry_skip``.
    Tests whose name contains none of the markers are left untouched.
    """
    lowered_name = request.node.name.lower()

    # Order matters: only the first matching marker is acted upon.
    for telemetry_kind in ("entrypoint", "bashrc", "framework"):
        if f"telemetry_{telemetry_kind}" in lowered_name:
            _check_telemetry_skip(request, telemetry_kind)
            break


def _get_telemetry_image_info(request):
"""Helper function to get image URI and framework info from fixtures."""
telemetry_framework_fixtures = [
"pytorch_training",
"tensorflow_training",
"tensorflow_inference",
"pytorch_inference",
"pytorch_inference_arm64",
"pytorch_training_arm64",
"tensorflow_inference_arm64",
]

for fixture_name in telemetry_framework_fixtures:
if fixture_name in request.fixturenames:
img_uri = request.getfixturevalue(fixture_name)
image_framework, image_framework_version = get_framework_and_version_from_tag(img_uri)
return image_framework, image_framework_version
return None, None


def _check_telemetry_skip(request, test_type):
    """Skip the current test when its image version is listed for ``test_type``.

    Nothing happens when ``test_type`` has no entry in
    ``TELEMETRY_SKIP_VERSIONS``, when no image framework can be determined for
    the test, or when that framework has no entry for the test type.

    :param request: pytest ``request`` object of the running test
    :param test_type: key into ``TELEMETRY_SKIP_VERSIONS``
    """
    if test_type not in TELEMETRY_SKIP_VERSIONS:
        return
    versions_by_framework = TELEMETRY_SKIP_VERSIONS[test_type]

    framework, framework_version = _get_telemetry_image_info(request)
    if not framework or framework not in versions_by_framework:
        return

    if framework_version not in versions_by_framework[framework]:
        return

    pytest.skip(
        f"Telemetry {test_type} test is not supported for "
        f"{framework} version {framework_version}"
    )


def _validate_pytorch_framework_version(request, image_uri, test_name, skip_dict):
"""
Expected format of skip_dic:
Expand Down

This file was deleted.

Loading