Skip to content

Commit fdf2e9a

Browse files
schinmayee authored and pintaoz-aws committed
Neuron URIs update (#1626)
Resolve recipes correctly before launching (#1529) fixes. (#1532) fix recipe path. (#1566)
1 parent 74d6b7c commit fdf2e9a

File tree

5 files changed

+84
-3
lines changed

5 files changed

+84
-3
lines changed
Lines changed: 52 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,52 @@
1+
{
2+
"training": {
3+
"processors": [
4+
"neuronx"
5+
],
6+
"version_aliases": {
7+
"2.1.2": "2.1.2"
8+
},
9+
"versions": {
10+
"2.1.2": {
11+
"py_versions": [
12+
"py310"
13+
],
14+
"repository": "pytorch-training-neuronx",
15+
"registries": {
16+
"af-south-1": "626614931356",
17+
"ap-east-1": "871362719292",
18+
"ap-northeast-1": "763104351884",
19+
"ap-northeast-2": "763104351884",
20+
"ap-northeast-3": "364406365360",
21+
"ap-south-1": "763104351884",
22+
"ap-south-2": "772153158452",
23+
"ap-southeast-1": "763104351884",
24+
"ap-southeast-2": "763104351884",
25+
"ap-southeast-3": "907027046896",
26+
"ap-southeast-4": "457447274322",
27+
"eu-central-1": "763104351884",
28+
"eu-central-2": "380420809688",
29+
"eu-north-1": "763104351884",
30+
"eu-south-2": "503227376785",
31+
"eu-west-1": "763104351884",
32+
"eu-west-2": "763104351884",
33+
"eu-west-3": "763104351884",
34+
"il-central-1": "780543022126",
35+
"sa-east-1": "763104351884",
36+
"us-east-1": "763104351884",
37+
"us-east-2": "763104351884",
38+
"us-west-1": "763104351884",
39+
"us-west-2": "763104351884",
40+
"ca-west-1": "204538143572",
41+
"ca-central-1": "763104351884"
42+
},
43+
"container_version": {
44+
"neuronx": "ubuntu20.04"
45+
},
46+
"sdk_versions": [
47+
"sdk2.20.2"
48+
]
49+
}
50+
}
51+
}
52+
}

src/sagemaker/image_uris.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -224,7 +224,7 @@ def retrieve(
224224
container_version = version_config["container_version"][processor]
225225

226226
# Append sdk version in case of trainium instances
227-
if repo in ["pytorch-training-neuron"]:
227+
if repo in ["pytorch-training-neuron", "pytorch-training-neuronx"]:
228228
if not sdk_version:
229229
sdk_version = _get_latest_versions(version_config["sdk_versions"])
230230
container_version = sdk_version + "-" + container_version

src/sagemaker/pytorch/estimator.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -177,7 +177,7 @@ def __init__(
177177
a directory with any other training source code dependencies aside from the entry
178178
point file (default: None). If ``source_dir`` is an S3 URI, it must
179179
point to a tar.gz file. Structure within this directory are preserved
180-
when training on Amazon SageMaker.
180+
when training on Amazon SageMaker. Must be a local path when using training_recipe.
181181
hyperparameters (dict[str, str] or dict[str, PipelineVariable]): Hyperparameters
182182
that will be used for training (default: None). The hyperparameters are made
183183
accessible as a dict[str, str] to the training code on
@@ -635,6 +635,10 @@ def _setup_for_training_recipe(cls, training_recipe, recipe_overrides, source_di
635635
if source_dir is None:
636636
args["source_dir"] = "."
637637
else:
638+
if not os.path.exists(source_dir):
639+
raise ValueError(
640+
"When using training_recipe, source_dir must be a local directory."
641+
)
638642
args["source_dir"] = source_dir
639643

640644
recipe_name = os.path.splitext(os.path.basename(training_recipe))[0]

src/sagemaker/pytorch/training_recipes.json

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -7,5 +7,9 @@
77
"version": "2.4.1",
88
"additional_args": {}
99
},
10-
"neuron_image": "855988369404.dkr.ecr.us-west-2.amazonaws.com/chinmayee-dev:neuron_sept26_v1"
10+
"neuron_image" : {
11+
"framework": "hyperpod-recipes-neuron",
12+
"version": "2.1.2",
13+
"additional_args": {}
14+
}
1115
}

tests/unit/test_pytorch.py

Lines changed: 21 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,7 @@
2323
from sagemaker import image_uris
2424
from sagemaker.pytorch import defaults
2525
from sagemaker.pytorch import PyTorch, PyTorchPredictor, PyTorchModel
26+
from sagemaker.pytorch.estimator import _get_training_recipe_image_uri
2627
from sagemaker.instance_group import InstanceGroup
2728
from sagemaker.session_settings import SessionSettings
2829

@@ -67,6 +68,13 @@
6768
"neuronx-distributed-training/refs/heads/main/examples/"
6869
"conf/hf_llama3_8B_config.yaml"
6970
)
71+
RECIPE_GPU_IMAGE = (
72+
"658645717510.dkr.ecr.us-west-2.amazonaws.com/smdistributed-modelparallel:2.4.1-gpu-py311"
73+
)
74+
RECIPE_NEURON_IMAGE = (
75+
"763104351884.dkr.ecr.us-west-2.amazonaws.com/"
76+
"pytorch-training-neuronx:2.1.2-neuronx-py310-sdk2.20.2-ubuntu20.04"
77+
)
7078

7179

7280
@pytest.fixture(name="sagemaker_session")
@@ -1085,3 +1093,16 @@ def test_training_recipe_for_trainium_custom_source_dir(sagemaker_session):
10851093
},
10861094
}
10871095
assert pytorch.distribution == expected_distribution
1096+
1097+
1098+
def test_training_recipe_images_uri():
1099+
gpu_image_cfg = {"framework": "pytorch-smp", "version": "2.4.1", "additional_args": {}}
1100+
gpu_image_uri = _get_training_recipe_image_uri(gpu_image_cfg, "us-west-2")
1101+
assert gpu_image_uri == RECIPE_GPU_IMAGE
1102+
neuron_image_cfg = {
1103+
"framework": "hyperpod-recipes-neuron",
1104+
"version": "2.1.2",
1105+
"additional_args": {},
1106+
}
1107+
neuron_image_uri = _get_training_recipe_image_uri(neuron_image_cfg, "us-west-2")
1108+
assert neuron_image_uri == RECIPE_NEURON_IMAGE

0 commit comments

Comments
 (0)