|
6 | 6 | "metadata": {}, |
7 | 7 | "outputs": [], |
8 | 8 | "source": [ |
9 | | - "import sys, os\n", |
10 | | - "\n", |
11 | | - "# Get the absolute path of the root directory\n", |
12 | | - "root_dir = os.path.abspath(os.path.join(os.getcwd(), \"../../..\"))\n", |
13 | | - "sys.path.insert(0, root_dir)" |
| 9 | + "alias = \"<alias>\"" |
| 10 | + ] |
| 11 | + }, |
| 12 | + { |
| 13 | + "cell_type": "code", |
| 14 | + "execution_count": null, |
| 15 | + "metadata": {}, |
| 16 | + "outputs": [], |
| 17 | + "source": [ |
| 18 | + "%pip install \"pydantic>=2.0.0\" sagemaker-core" |
| 19 | + ] |
| 20 | + }, |
| 21 | + { |
| 22 | + "cell_type": "code", |
| 23 | + "execution_count": null, |
| 24 | + "metadata": {}, |
| 25 | + "outputs": [], |
| 26 | + "source": [ |
| 27 | + "%pip install sagemaker-2.232.4.dev0.tar.gz" |
14 | 28 | ] |
15 | 29 | }, |
16 | 30 | { |
|
37 | 51 | "model_trainer = ModelTrainer(\n", |
38 | 52 | " training_image=pytorch_image,\n", |
39 | 53 | " source_code=source_code,\n", |
| 54 | + " base_job_name=f\"{alias}-minimal-case\",\n", |
40 | 55 | ")" |
41 | 56 | ] |
42 | 57 | }, |
|
46 | 61 | "metadata": {}, |
47 | 62 | "outputs": [], |
48 | 63 | "source": [ |
49 | | - "model_trainer.train(wait=False)" |
50 | | - ] |
51 | | - }, |
52 | | - { |
53 | | - "cell_type": "markdown", |
54 | | - "metadata": {}, |
55 | | - "source": [ |
56 | | - "Successful Run - https://tiny.amazon.com/3maxeyb/IsenLink" |
| 64 | + "model_trainer.train()" |
57 | 65 | ] |
58 | 66 | }, |
59 | 67 | { |
|
82 | 90 | "model_trainer = ModelTrainer(\n", |
83 | 91 | " training_image=pytorch_image,\n", |
84 | 92 | " source_code=source_code,\n", |
| 93 | + " base_job_name=f\"{alias}-simple-case-1\",\n", |
85 | 94 | ")\n", |
86 | 95 | "\n", |
87 | | - "model_trainer.train(wait=False)" |
88 | | - ] |
89 | | - }, |
90 | | - { |
91 | | - "cell_type": "markdown", |
92 | | - "metadata": {}, |
93 | | - "source": [ |
94 | | - "Successful Run - https://tiny.amazon.com/6uy7pmpj/IsenLink" |
| 96 | + "model_trainer.train()" |
95 | 97 | ] |
96 | 98 | }, |
97 | 99 | { |
|
118 | 120 | "model_trainer = ModelTrainer(\n", |
119 | 121 | " training_image=pytorch_image,\n", |
120 | 122 | " source_code=source_code,\n", |
| 123 | + " base_job_name=f\"{alias}-simple-case-2\",\n", |
121 | 124 | ")" |
122 | 125 | ] |
123 | 126 | }, |
|
127 | 130 | "metadata": {}, |
128 | 131 | "outputs": [], |
129 | 132 | "source": [ |
130 | | - "model_trainer.train(wait=False)" |
131 | | - ] |
132 | | - }, |
133 | | - { |
134 | | - "cell_type": "markdown", |
135 | | - "metadata": {}, |
136 | | - "source": [ |
137 | | - "Successful Run - https://tiny.amazon.com/7n4n4ogr/IsenLink" |
| 133 | + "model_trainer.train()" |
138 | 134 | ] |
139 | 135 | }, |
140 | 136 | { |
|
159 | 155 | "metadata": {}, |
160 | 156 | "outputs": [], |
161 | 157 | "source": [ |
162 | | - "access_token = os.environ.get(\"HF_TOKEN\", \"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\")\n", |
| 158 | + "import os\n", |
| 159 | + "\n", |
| 160 | + "access_token = os.environ[\"HF_TOKEN\"]  # export HF_TOKEN before starting the kernel; never commit a real token\n", |
| 161 | + "os.environ[\"HUGGING_FACE_HUB_TOKEN\"] = access_token\n", |
| 162 | + "\n", |
163 | 163 | "model_id = \"meta-llama/Llama-2-7b-hf\"\n", |
164 | 164 | "\n", |
165 | 165 | "dataset_name = \"tatsu-lab/alpaca\"" |
|
276 | 276 | "cell_type": "markdown", |
277 | 277 | "metadata": {}, |
278 | 278 | "source": [ |
279 | | - "#### Model Trainer Torchrun" |
| 279 | + "#### Model Trainer Torchrun - Manual" |
280 | 280 | ] |
281 | 281 | }, |
282 | 282 | { |
|
298 | 298 | "env[\"RDMAV_FORK_SAFE\"] = \"1\"\n", |
299 | 299 | "\n", |
300 | 300 | "compute = Compute(\n", |
301 | | - " instance_count=2,\n", |
302 | | - " instance_type=\"ml.p4d.24xlarge\",\n", |
| 301 | + " instance_count=1,\n", |
| 302 | + " instance_type=\"ml.g5.48xlarge\",\n", |
303 | 303 | " volume_size_in_gb=96,\n", |
304 | 304 | " keep_alive_period_in_seconds=3600\n", |
305 | 305 | ")\n", |
|
329 | 329 | "source_code = SourceCode(\n", |
330 | 330 | " source_dir=\"distributed-training/scripts\",\n", |
331 | 331 | " requirements=\"requirements.txt\",\n", |
332 | | - " command=\"torchrun --nnodes 2 \\\n", |
| 332 | + " command=\"torchrun --nnodes 1 \\\n", |
333 | 333 | " --nproc_per_node 8 \\\n", |
334 | 334 | " --master_addr algo-1 \\\n", |
335 | 335 | " --master_port 7777 \\\n", |
|
343 | 343 | " environment=env,\n", |
344 | 344 | " hyperparameters=hyperparameters,\n", |
345 | 345 | " source_code=source_code,\n", |
| 346 | + " base_job_name=f\"{alias}-distributed-case\",\n", |
346 | 347 | ")" |
347 | 348 | ] |
348 | 349 | }, |
|
356 | 357 | " channel_name=\"dataset\",\n", |
357 | 358 | " data_source=training_input_path,\n", |
358 | 359 | ")\n", |
359 | | - "model_trainer.train(input_data_config=[test_data], wait=False)" |
| 360 | + "model_trainer.train(input_data_config=[test_data])" |
360 | 361 | ] |
361 | 362 | }, |
362 | 363 | { |
363 | 364 | "cell_type": "markdown", |
364 | 365 | "metadata": {}, |
365 | 366 | "source": [ |
366 | | - "Successful Run - https://tiny.amazon.com/10wljn1yu/IsenLink" |
| 367 | + "#### Model Trainer Torchrun - Abstractions" |
367 | 368 | ] |
368 | 369 | }, |
369 | 370 | { |
|
379 | 380 | "\n", |
380 | 381 | "compute = Compute(\n", |
381 | 382 | " instance_count=2,\n", |
382 | | - " instance_type=\"ml.p4d.24xlarge\",\n", |
| 383 | + " instance_type=\"ml.g5.48xlarge\",\n", |
383 | 384 | " volume_size_in_gb=96,\n", |
384 | 385 | " keep_alive_period_in_seconds=3600\n", |
385 | 386 | ")\n", |
|
445 | 446 | " hyperparameters=hyperparameters,\n", |
446 | 447 | " source_code=source_code,\n", |
447 | 448 | " distributed_runner=mpi,\n", |
| 449 | + " base_job_name=f\"{alias}-distributed-abstractions\",\n", |
448 | 450 | ")" |
449 | 451 | ] |
450 | 452 | }, |
|
458 | 460 | " channel_name=\"dataset\",\n", |
459 | 461 | " data_source=training_input_path,\n", |
460 | 462 | ")\n", |
461 | | - "model_trainer.train(input_data_config=[test_data], wait=False)" |
| 463 | + "model_trainer.train(input_data_config=[test_data])" |
462 | 464 | ] |
463 | 465 | }, |
464 | 466 | { |
|
511 | 513 | " }\n", |
512 | 514 | "}\n", |
513 | 515 | "\n", |
514 | | - "training_image = \"059094755717.dkr.ecr.us-west-2.amazonaws.com/sagemaker-recipes-gpu\"\n", |
| 516 | + "training_image = \"438465156666.dkr.ecr.us-west-2.amazonaws.com/sagemaker-recipes-gpu\"\n", |
515 | 517 | "\n", |
516 | 518 | "model_trainer = ModelTrainer.from_recipe(\n", |
517 | 519 | " training_recipe=\"training/llama/hf_llama3_8b_seq8192_gpu\",\n", |
518 | 520 | " training_image=training_image,\n", |
519 | 521 | " recipe_overrides=recipe_overrides,\n", |
520 | | - " compute=Compute(instance_type=\"ml.p4d.24xlarge\")\n", |
| 522 | + " compute=Compute(instance_type=\"ml.g5.48xlarge\"),\n", |
| 523 | + " base_job_name=f\"{alias}-recipe-case-1\",\n", |
521 | 524 | ")" |
522 | 525 | ] |
523 | 526 | }, |
|
530 | 533 | "model_trainer.train()" |
531 | 534 | ] |
532 | 535 | }, |
533 | | - { |
534 | | - "cell_type": "markdown", |
535 | | - "metadata": {}, |
536 | | - "source": [ |
537 | | - "Successful Run - https://tiny.amazon.com/14jxjrndx/IsenLink" |
538 | | - ] |
539 | | - }, |
540 | 536 | { |
541 | 537 | "cell_type": "markdown", |
542 | 538 | "metadata": {}, |
|
553 | 549 | "from sagemaker.modules.train import ModelTrainer\n", |
554 | 550 | "from sagemaker.modules.configs import Compute\n", |
555 | 551 | "\n", |
556 | | - "training_image = \"059094755717.dkr.ecr.us-west-2.amazonaws.com/sagemaker-recipes-gpu\"\n", |
| 552 | + "training_image = \"438465156666.dkr.ecr.us-west-2.amazonaws.com/sagemaker-recipes-gpu\"\n", |
557 | 553 | "\n", |
558 | 554 | "model_trainer = ModelTrainer.from_recipe(\n", |
559 | 555 | " training_recipe=\"recipes/custom-recipe.yaml\",\n", |
560 | 556 | " training_image=training_image,\n", |
561 | | - " compute=Compute(instance_type=\"ml.p4d.24xlarge\")\n", |
| 557 | + " compute=Compute(instance_type=\"ml.g5.48xlarge\"),\n", |
| 558 | + " base_job_name=f\"{alias}-recipe-case-2\",\n", |
562 | 559 | ")" |
563 | 560 | ] |
564 | 561 | }, |
|
571 | 568 | "model_trainer.train()" |
572 | 569 | ] |
573 | 570 | }, |
574 | | - { |
575 | | - "cell_type": "markdown", |
576 | | - "metadata": {}, |
577 | | - "source": [ |
578 | | - "Successful Run - https://tiny.amazon.com/dimbimx1/IsenLink" |
579 | | - ] |
580 | | - }, |
581 | 571 | { |
582 | 572 | "cell_type": "markdown", |
583 | 573 | "metadata": {}, |
|
624 | 614 | " \"FI_EFA_FORK_SAFE\": \"1\"\n", |
625 | 615 | "}\n", |
626 | 616 | "\n", |
627 | | - "training_image = \"059094755717.dkr.ecr.us-west-2.amazonaws.com/sagemaker-recipes-neuron\"\n", |
| 617 | + "training_image = \"438465156666.dkr.ecr.us-west-2.amazonaws.com/sagemaker-recipes-neuron\"\n", |
628 | 618 | "\n", |
629 | 619 | "model_trainer = ModelTrainer.from_recipe(\n", |
630 | 620 | " training_recipe=\"https://raw.githubusercontent.com/aws-neuron/neuronx-distributed-training/refs/heads/main/examples/conf/hf_llama3_8B_config.yaml\",\n", |
|
635 | 625 | " instance_count=2,\n", |
636 | 626 | " ),\n", |
637 | 627 | " stopping_condition=StoppingCondition(\n", |
638 | | - " max_runtime_in_seconds=86400\n", |
| 628 | + " max_runtime_in_seconds=3600\n", |
639 | 629 | " ),\n", |
640 | 630 | " environment=env\n", |
641 | 631 | ")" |
|
649 | 639 | "source": [ |
650 | 640 | "train = InputData(\n", |
651 | 641 | " channel_name=\"train\",\n", |
652 | | - " data_source=\"s3://sagemaker-recipes-059094755717-data/data_llama3/\",\n", |
| 642 | + " data_source=\"s3://sagemaker-recipes-438465156666-data/data_llama3/\",\n", |
653 | 643 | ")\n", |
654 | 644 | "\n", |
655 | 645 | "model_trainer.train(input_data_config=[train], wait=False)" |
656 | 646 | ] |
657 | | - }, |
658 | | - { |
659 | | - "cell_type": "markdown", |
660 | | - "metadata": {}, |
661 | | - "source": [ |
662 | | - "Successful Run - https://tiny.amazon.com/125zldym8/IsenLink" |
663 | | - ] |
664 | 647 | } |
665 | 648 | ], |
666 | 649 | "metadata": { |
|
0 commit comments