Skip to content

Commit 67f535d

Browse files
benieric authored and pintaoz-aws committed
update notebooks (#1588)
1 parent 6b90f89 commit 67f535d

File tree

2 files changed

+90
-87
lines changed

2 files changed

+90
-87
lines changed

src/sagemaker/modules/testing_notebooks/base_model_trainer.ipynb

Lines changed: 49 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,25 @@
66
"metadata": {},
77
"outputs": [],
88
"source": [
9-
"import sys, os\n",
10-
"\n",
11-
"# Get the absolute path of the root directory\n",
12-
"root_dir = os.path.abspath(os.path.join(os.getcwd(), \"../../..\"))\n",
13-
"sys.path.insert(0, root_dir)"
9+
"alias = \"<alias>\""
10+
]
11+
},
12+
{
13+
"cell_type": "code",
14+
"execution_count": null,
15+
"metadata": {},
16+
"outputs": [],
17+
"source": [
18+
"! pip install \"pydantic>=2.0.0\" sagemaker-core"
19+
]
20+
},
21+
{
22+
"cell_type": "code",
23+
"execution_count": null,
24+
"metadata": {},
25+
"outputs": [],
26+
"source": [
27+
"! pip install sagemaker-2.232.4.dev0.tar.gz"
1428
]
1529
},
1630
{
@@ -37,6 +51,7 @@
3751
"model_trainer = ModelTrainer(\n",
3852
" training_image=pytorch_image,\n",
3953
" source_code=source_code,\n",
54+
" base_job_name=f\"{alias}-minimal-case\",\n",
4055
")"
4156
]
4257
},
@@ -46,14 +61,7 @@
4661
"metadata": {},
4762
"outputs": [],
4863
"source": [
49-
"model_trainer.train(wait=False)"
50-
]
51-
},
52-
{
53-
"cell_type": "markdown",
54-
"metadata": {},
55-
"source": [
56-
"Successful Run - https://tiny.amazon.com/3maxeyb/IsenLink"
64+
"model_trainer.train()"
5765
]
5866
},
5967
{
@@ -82,16 +90,10 @@
8290
"model_trainer = ModelTrainer(\n",
8391
" training_image=pytorch_image,\n",
8492
" source_code=source_code,\n",
93+
" base_job_name=f\"{alias}-simple-case-1\",\n",
8594
")\n",
8695
"\n",
87-
"model_trainer.train(wait=False)"
88-
]
89-
},
90-
{
91-
"cell_type": "markdown",
92-
"metadata": {},
93-
"source": [
94-
"Successful Run - https://tiny.amazon.com/6uy7pmpj/IsenLink"
96+
"model_trainer.train()"
9597
]
9698
},
9799
{
@@ -118,6 +120,7 @@
118120
"model_trainer = ModelTrainer(\n",
119121
" training_image=pytorch_image,\n",
120122
" source_code=source_code,\n",
123+
" base_job_name=f\"{alias}-simple-case-2\",\n",
121124
")"
122125
]
123126
},
@@ -127,14 +130,7 @@
127130
"metadata": {},
128131
"outputs": [],
129132
"source": [
130-
"model_trainer.train(wait=False)"
131-
]
132-
},
133-
{
134-
"cell_type": "markdown",
135-
"metadata": {},
136-
"source": [
137-
"Successful Run - https://tiny.amazon.com/7n4n4ogr/IsenLink"
133+
"model_trainer.train()"
138134
]
139135
},
140136
{
@@ -159,7 +155,11 @@
159155
"metadata": {},
160156
"outputs": [],
161157
"source": [
162-
"access_token = os.environ.get(\"HF_TOKEN\", \"<hf-access-token>\")\n",
158+
"import os\n",
159+
"\n",
160+
"access_token = \"<hf-access-token>\"\n",
161+
"os.environ[\"HUGGING_FACE_HUB_TOKEN\"] = access_token\n",
162+
"\n",
163163
"model_id = \"meta-llama/Llama-2-7b-hf\"\n",
164164
"\n",
165165
"dataset_name = \"tatsu-lab/alpaca\""
@@ -276,7 +276,7 @@
276276
"cell_type": "markdown",
277277
"metadata": {},
278278
"source": [
279-
"#### Model Trainer Torchrun"
279+
"#### Model Trainer Torchrun - Manual"
280280
]
281281
},
282282
{
@@ -298,8 +298,8 @@
298298
"env[\"RDMAV_FORK_SAFE\"] = \"1\"\n",
299299
"\n",
300300
"compute = Compute(\n",
301-
" instance_count=2,\n",
302-
" instance_type=\"ml.p4d.24xlarge\",\n",
301+
" instance_count=1,\n",
302+
" instance_type=\"ml.g5.48xlarge\",\n",
303303
" volume_size_in_gb=96,\n",
304304
" keep_alive_period_in_seconds=3600\n",
305305
")\n",
@@ -329,7 +329,7 @@
329329
"source_code = SourceCode(\n",
330330
" source_dir=\"distributed-training/scripts\",\n",
331331
" requirements=\"requirements.txt\",\n",
332-
" command=\"torchrun --nnodes 2 \\\n",
332+
" command=\"torchrun --nnodes 1 \\\n",
333333
" --nproc_per_node 8 \\\n",
334334
" --master_addr algo-1 \\\n",
335335
" --master_port 7777 \\\n",
@@ -343,6 +343,7 @@
343343
" environment=env,\n",
344344
" hyperparameters=hyperparameters,\n",
345345
" source_code=source_code,\n",
346+
" base_job_name=f\"{alias}-distributed-case\",\n",
346347
")"
347348
]
348349
},
@@ -356,14 +357,14 @@
356357
" channel_name=\"dataset\",\n",
357358
" data_source=training_input_path,\n",
358359
")\n",
359-
"model_trainer.train(input_data_config=[test_data], wait=False)"
360+
"model_trainer.train(input_data_config=[test_data])"
360361
]
361362
},
362363
{
363364
"cell_type": "markdown",
364365
"metadata": {},
365366
"source": [
366-
"Successful Run - https://tiny.amazon.com/10wljn1yu/IsenLink"
367+
"#### Model Trainer Torchrun - Abstractions"
367368
]
368369
},
369370
{
@@ -379,7 +380,7 @@
379380
"\n",
380381
"compute = Compute(\n",
381382
" instance_count=2,\n",
382-
" instance_type=\"ml.p4d.24xlarge\",\n",
383+
" instance_type=\"ml.g5.48xlarge\",\n",
383384
" volume_size_in_gb=96,\n",
384385
" keep_alive_period_in_seconds=3600\n",
385386
")\n",
@@ -445,6 +446,7 @@
445446
" hyperparameters=hyperparameters,\n",
446447
" source_code=source_code,\n",
447448
" distributed_runner=mpi,\n",
449+
" base_job_name=f\"{alias}-distributed-abstractions\",\n",
448450
")"
449451
]
450452
},
@@ -458,7 +460,7 @@
458460
" channel_name=\"dataset\",\n",
459461
" data_source=training_input_path,\n",
460462
")\n",
461-
"model_trainer.train(input_data_config=[test_data], wait=False)"
463+
"model_trainer.train(input_data_config=[test_data])"
462464
]
463465
},
464466
{
@@ -511,13 +513,14 @@
511513
" }\n",
512514
"}\n",
513515
"\n",
514-
"training_image = \"059094755717.dkr.ecr.us-west-2.amazonaws.com/sagemaker-recipes-gpu\"\n",
516+
"training_image = \"438465156666.dkr.ecr.us-west-2.amazonaws.com/sagemaker-recipes-gpu\"\n",
515517
"\n",
516518
"model_trainer = ModelTrainer.from_recipe(\n",
517519
" training_recipe=\"training/llama/hf_llama3_8b_seq8192_gpu\",\n",
518520
" training_image=training_image,\n",
519521
" recipe_overrides=recipe_overrides,\n",
520-
" compute=Compute(instance_type=\"ml.p4d.24xlarge\")\n",
522+
" compute=Compute(instance_type=\"ml.g5.48xlarge\"),\n",
523+
" base_job_name=f\"{alias}-recipe-case-1\",\n",
521524
")"
522525
]
523526
},
@@ -530,13 +533,6 @@
530533
"model_trainer.train()"
531534
]
532535
},
533-
{
534-
"cell_type": "markdown",
535-
"metadata": {},
536-
"source": [
537-
"Successful Run - https://tiny.amazon.com/14jxjrndx/IsenLink"
538-
]
539-
},
540536
{
541537
"cell_type": "markdown",
542538
"metadata": {},
@@ -553,12 +549,13 @@
553549
"from sagemaker.modules.train import ModelTrainer\n",
554550
"from sagemaker.modules.configs import Compute\n",
555551
"\n",
556-
"training_image = \"059094755717.dkr.ecr.us-west-2.amazonaws.com/sagemaker-recipes-gpu\"\n",
552+
"training_image = \"438465156666.dkr.ecr.us-west-2.amazonaws.com/sagemaker-recipes-gpu\"\n",
557553
"\n",
558554
"model_trainer = ModelTrainer.from_recipe(\n",
559555
" training_recipe=\"recipes/custom-recipe.yaml\",\n",
560556
" training_image=training_image,\n",
561-
" compute=Compute(instance_type=\"ml.p4d.24xlarge\")\n",
557+
" compute=Compute(instance_type=\"ml.g5.48xlarge\"),\n",
558+
" base_job_name=f\"{alias}-recipe-case-2\",\n",
562559
")"
563560
]
564561
},
@@ -571,13 +568,6 @@
571568
"model_trainer.train()"
572569
]
573570
},
574-
{
575-
"cell_type": "markdown",
576-
"metadata": {},
577-
"source": [
578-
"Successful Run - https://tiny.amazon.com/dimbimx1/IsenLink"
579-
]
580-
},
581571
{
582572
"cell_type": "markdown",
583573
"metadata": {},
@@ -624,7 +614,7 @@
624614
" \"FI_EFA_FORK_SAFE\": \"1\"\n",
625615
"}\n",
626616
"\n",
627-
"training_image = \"059094755717.dkr.ecr.us-west-2.amazonaws.com/sagemaker-recipes-neuron\"\n",
617+
"training_image = \"438465156666.dkr.ecr.us-west-2.amazonaws.com/sagemaker-recipes-neuron\"\n",
628618
"\n",
629619
"model_trainer = ModelTrainer.from_recipe(\n",
630620
" training_recipe=\"https://raw.githubusercontent.com/aws-neuron/neuronx-distributed-training/refs/heads/main/examples/conf/hf_llama3_8B_config.yaml\",\n",
@@ -635,7 +625,7 @@
635625
" instance_count=2,\n",
636626
" ),\n",
637627
" stopping_condition=StoppingCondition(\n",
638-
" max_runtime_in_seconds=86400\n",
628+
" max_runtime_in_seconds=3600\n",
639629
" ),\n",
640630
" environment=env\n",
641631
")"
@@ -649,18 +639,11 @@
649639
"source": [
650640
"train = InputData(\n",
651641
" channel_name=\"train\",\n",
652-
" data_source=\"s3://sagemaker-recipes-059094755717-data/data_llama3/\",\n",
642+
" data_source=\"s3://sagemaker-recipes-438465156666-data/data_llama3/\",\n",
653643
")\n",
654644
"\n",
655645
"model_trainer.train(input_data_config=[train], wait=False)"
656646
]
657-
},
658-
{
659-
"cell_type": "markdown",
660-
"metadata": {},
661-
"source": [
662-
"Successful Run - https://tiny.amazon.com/125zldym8/IsenLink"
663-
]
664647
}
665648
],
666649
"metadata": {

0 commit comments

Comments
 (0)