|
6 | 6 | "metadata": {},
|
7 | 7 | "outputs": [],
|
8 | 8 | "source": [
|
9 |
| - "import sys, os\n", |
10 |
| - "\n", |
11 |
| - "# Get the absolute path of the root directory\n", |
12 |
| - "root_dir = os.path.abspath(os.path.join(os.getcwd(), \"../../..\"))\n", |
13 |
| - "sys.path.insert(0, root_dir)" |
| 9 | + "alias = \"<alias>\"" |
| 10 | + ] |
| 11 | + }, |
| 12 | + { |
| 13 | + "cell_type": "code", |
| 14 | + "execution_count": null, |
| 15 | + "metadata": {}, |
| 16 | + "outputs": [], |
| 17 | + "source": [ |
| 18 | + "! pip install \"pydantic>=2.0.0\" sagemaker-core" |
| 19 | + ] |
| 20 | + }, |
| 21 | + { |
| 22 | + "cell_type": "code", |
| 23 | + "execution_count": null, |
| 24 | + "metadata": {}, |
| 25 | + "outputs": [], |
| 26 | + "source": [ |
| 27 | + "! pip install sagemaker-2.232.4.dev0.tar.gz" |
14 | 28 | ]
|
15 | 29 | },
|
16 | 30 | {
|
|
37 | 51 | "model_trainer = ModelTrainer(\n",
|
38 | 52 | " training_image=pytorch_image,\n",
|
39 | 53 | " source_code=source_code,\n",
|
| 54 | + " base_job_name=f\"{alias}-minimal-case\",\n", |
40 | 55 | ")"
|
41 | 56 | ]
|
42 | 57 | },
|
|
46 | 61 | "metadata": {},
|
47 | 62 | "outputs": [],
|
48 | 63 | "source": [
|
49 |
| - "model_trainer.train(wait=False)" |
50 |
| - ] |
51 |
| - }, |
52 |
| - { |
53 |
| - "cell_type": "markdown", |
54 |
| - "metadata": {}, |
55 |
| - "source": [ |
56 |
| - "Successful Run - https://tiny.amazon.com/3maxeyb/IsenLink" |
| 64 | + "model_trainer.train()" |
57 | 65 | ]
|
58 | 66 | },
|
59 | 67 | {
|
|
82 | 90 | "model_trainer = ModelTrainer(\n",
|
83 | 91 | " training_image=pytorch_image,\n",
|
84 | 92 | " source_code=source_code,\n",
|
| 93 | + " base_job_name=f\"{alias}-simple-case-1\",\n", |
85 | 94 | ")\n",
|
86 | 95 | "\n",
|
87 |
| - "model_trainer.train(wait=False)" |
88 |
| - ] |
89 |
| - }, |
90 |
| - { |
91 |
| - "cell_type": "markdown", |
92 |
| - "metadata": {}, |
93 |
| - "source": [ |
94 |
| - "Successful Run - https://tiny.amazon.com/6uy7pmpj/IsenLink" |
| 96 | + "model_trainer.train()" |
95 | 97 | ]
|
96 | 98 | },
|
97 | 99 | {
|
|
118 | 120 | "model_trainer = ModelTrainer(\n",
|
119 | 121 | " training_image=pytorch_image,\n",
|
120 | 122 | " source_code=source_code,\n",
|
| 123 | + " base_job_name=f\"{alias}-simple-case-2\",\n", |
121 | 124 | ")"
|
122 | 125 | ]
|
123 | 126 | },
|
|
127 | 130 | "metadata": {},
|
128 | 131 | "outputs": [],
|
129 | 132 | "source": [
|
130 |
| - "model_trainer.train(wait=False)" |
131 |
| - ] |
132 |
| - }, |
133 |
| - { |
134 |
| - "cell_type": "markdown", |
135 |
| - "metadata": {}, |
136 |
| - "source": [ |
137 |
| - "Successful Run - https://tiny.amazon.com/7n4n4ogr/IsenLink" |
| 133 | + "model_trainer.train()" |
138 | 134 | ]
|
139 | 135 | },
|
140 | 136 | {
|
|
159 | 155 | "metadata": {},
|
160 | 156 | "outputs": [],
|
161 | 157 | "source": [
|
162 |
| - "access_token = os.environ.get(\"HF_TOKEN\", \"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\")\n", |
| 158 | + "import os\n", |
| 159 | + "\n", |
| 160 | + "access_token = \"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\"\n", |
| 161 | + "os.environ[\"HUGGING_FACE_HUB_TOKEN\"] = access_token\n", |
| 162 | + "\n", |
163 | 163 | "model_id = \"meta-llama/Llama-2-7b-hf\"\n",
|
164 | 164 | "\n",
|
165 | 165 | "dataset_name = \"tatsu-lab/alpaca\""
|
|
276 | 276 | "cell_type": "markdown",
|
277 | 277 | "metadata": {},
|
278 | 278 | "source": [
|
279 |
| - "#### Model Trainer Torchrun" |
| 279 | + "#### Model Trainer Torchrun - Manual" |
280 | 280 | ]
|
281 | 281 | },
|
282 | 282 | {
|
|
298 | 298 | "env[\"RDMAV_FORK_SAFE\"] = \"1\"\n",
|
299 | 299 | "\n",
|
300 | 300 | "compute = Compute(\n",
|
301 |
| - " instance_count=2,\n", |
302 |
| - " instance_type=\"ml.p4d.24xlarge\",\n", |
| 301 | + " instance_count=1,\n", |
| 302 | + " instance_type=\"ml.g5.48xlarge\",\n", |
303 | 303 | " volume_size_in_gb=96,\n",
|
304 | 304 | " keep_alive_period_in_seconds=3600\n",
|
305 | 305 | ")\n",
|
|
329 | 329 | "source_code = SourceCode(\n",
|
330 | 330 | " source_dir=\"distributed-training/scripts\",\n",
|
331 | 331 | " requirements=\"requirements.txt\",\n",
|
332 |
| - " command=\"torchrun --nnodes 2 \\\n", |
| 332 | + " command=\"torchrun --nnodes 1 \\\n", |
333 | 333 | " --nproc_per_node 8 \\\n",
|
334 | 334 | " --master_addr algo-1 \\\n",
|
335 | 335 | " --master_port 7777 \\\n",
|
|
343 | 343 | " environment=env,\n",
|
344 | 344 | " hyperparameters=hyperparameters,\n",
|
345 | 345 | " source_code=source_code,\n",
|
| 346 | + " base_job_name=f\"{alias}-distributed-case\",\n", |
346 | 347 | ")"
|
347 | 348 | ]
|
348 | 349 | },
|
|
356 | 357 | " channel_name=\"dataset\",\n",
|
357 | 358 | " data_source=training_input_path,\n",
|
358 | 359 | ")\n",
|
359 |
| - "model_trainer.train(input_data_config=[test_data], wait=False)" |
| 360 | + "model_trainer.train(input_data_config=[test_data])" |
360 | 361 | ]
|
361 | 362 | },
|
362 | 363 | {
|
363 | 364 | "cell_type": "markdown",
|
364 | 365 | "metadata": {},
|
365 | 366 | "source": [
|
366 |
| - "Successful Run - https://tiny.amazon.com/10wljn1yu/IsenLink" |
| 367 | + "#### Model Trainer Torchrun - Abstractions" |
367 | 368 | ]
|
368 | 369 | },
|
369 | 370 | {
|
|
379 | 380 | "\n",
|
380 | 381 | "compute = Compute(\n",
|
381 | 382 | " instance_count=2,\n",
|
382 |
| - " instance_type=\"ml.p4d.24xlarge\",\n", |
| 383 | + " instance_type=\"ml.g5.48xlarge\",\n", |
383 | 384 | " volume_size_in_gb=96,\n",
|
384 | 385 | " keep_alive_period_in_seconds=3600\n",
|
385 | 386 | ")\n",
|
|
445 | 446 | " hyperparameters=hyperparameters,\n",
|
446 | 447 | " source_code=source_code,\n",
|
447 | 448 | " distributed_runner=mpi,\n",
|
| 449 | + " base_job_name=f\"{alias}-distributed-abstractions\",\n", |
448 | 450 | ")"
|
449 | 451 | ]
|
450 | 452 | },
|
|
458 | 460 | " channel_name=\"dataset\",\n",
|
459 | 461 | " data_source=training_input_path,\n",
|
460 | 462 | ")\n",
|
461 |
| - "model_trainer.train(input_data_config=[test_data], wait=False)" |
| 463 | + "model_trainer.train(input_data_config=[test_data])" |
462 | 464 | ]
|
463 | 465 | },
|
464 | 466 | {
|
|
511 | 513 | " }\n",
|
512 | 514 | "}\n",
|
513 | 515 | "\n",
|
514 |
| - "training_image = \"059094755717.dkr.ecr.us-west-2.amazonaws.com/sagemaker-recipes-gpu\"\n", |
| 516 | + "training_image = \"438465156666.dkr.ecr.us-west-2.amazonaws.com/sagemaker-recipes-gpu\"\n", |
515 | 517 | "\n",
|
516 | 518 | "model_trainer = ModelTrainer.from_recipe(\n",
|
517 | 519 | " training_recipe=\"training/llama/hf_llama3_8b_seq8192_gpu\",\n",
|
518 | 520 | " training_image=training_image,\n",
|
519 | 521 | " recipe_overrides=recipe_overrides,\n",
|
520 |
| - " compute=Compute(instance_type=\"ml.p4d.24xlarge\")\n", |
| 522 | + " compute=Compute(instance_type=\"ml.g5.48xlarge\"),\n", |
| 523 | + " base_job_name=f\"{alias}-recipe-case-1\",\n", |
521 | 524 | ")"
|
522 | 525 | ]
|
523 | 526 | },
|
|
530 | 533 | "model_trainer.train()"
|
531 | 534 | ]
|
532 | 535 | },
|
533 |
| - { |
534 |
| - "cell_type": "markdown", |
535 |
| - "metadata": {}, |
536 |
| - "source": [ |
537 |
| - "Successful Run - https://tiny.amazon.com/14jxjrndx/IsenLink" |
538 |
| - ] |
539 |
| - }, |
540 | 536 | {
|
541 | 537 | "cell_type": "markdown",
|
542 | 538 | "metadata": {},
|
|
553 | 549 | "from sagemaker.modules.train import ModelTrainer\n",
|
554 | 550 | "from sagemaker.modules.configs import Compute\n",
|
555 | 551 | "\n",
|
556 |
| - "training_image = \"059094755717.dkr.ecr.us-west-2.amazonaws.com/sagemaker-recipes-gpu\"\n", |
| 552 | + "training_image = \"438465156666.dkr.ecr.us-west-2.amazonaws.com/sagemaker-recipes-gpu\"\n", |
557 | 553 | "\n",
|
558 | 554 | "model_trainer = ModelTrainer.from_recipe(\n",
|
559 | 555 | " training_recipe=\"recipes/custom-recipe.yaml\",\n",
|
560 | 556 | " training_image=training_image,\n",
|
561 |
| - " compute=Compute(instance_type=\"ml.p4d.24xlarge\")\n", |
| 557 | + " compute=Compute(instance_type=\"ml.g5.48xlarge\"),\n", |
| 558 | + " base_job_name=f\"{alias}-recipe-case-2\",\n", |
562 | 559 | ")"
|
563 | 560 | ]
|
564 | 561 | },
|
|
571 | 568 | "model_trainer.train()"
|
572 | 569 | ]
|
573 | 570 | },
|
574 |
| - { |
575 |
| - "cell_type": "markdown", |
576 |
| - "metadata": {}, |
577 |
| - "source": [ |
578 |
| - "Successful Run - https://tiny.amazon.com/dimbimx1/IsenLink" |
579 |
| - ] |
580 |
| - }, |
581 | 571 | {
|
582 | 572 | "cell_type": "markdown",
|
583 | 573 | "metadata": {},
|
|
624 | 614 | " \"FI_EFA_FORK_SAFE\": \"1\"\n",
|
625 | 615 | "}\n",
|
626 | 616 | "\n",
|
627 |
| - "training_image = \"059094755717.dkr.ecr.us-west-2.amazonaws.com/sagemaker-recipes-neuron\"\n", |
| 617 | + "training_image = \"438465156666.dkr.ecr.us-west-2.amazonaws.com/sagemaker-recipes-neuron\"\n", |
628 | 618 | "\n",
|
629 | 619 | "model_trainer = ModelTrainer.from_recipe(\n",
|
630 | 620 | " training_recipe=\"https://raw.githubusercontent.com/aws-neuron/neuronx-distributed-training/refs/heads/main/examples/conf/hf_llama3_8B_config.yaml\",\n",
|
|
635 | 625 | " instance_count=2,\n",
|
636 | 626 | " ),\n",
|
637 | 627 | " stopping_condition=StoppingCondition(\n",
|
638 |
| - " max_runtime_in_seconds=86400\n", |
| 628 | + " max_runtime_in_seconds=3600\n", |
639 | 629 | " ),\n",
|
640 | 630 | " environment=env\n",
|
641 | 631 | ")"
|
|
649 | 639 | "source": [
|
650 | 640 | "train = InputData(\n",
|
651 | 641 | " channel_name=\"train\",\n",
|
652 |
| - " data_source=\"s3://sagemaker-recipes-059094755717-data/data_llama3/\",\n", |
| 642 | + " data_source=\"s3://sagemaker-recipes-438465156666-data/data_llama3/\",\n", |
653 | 643 | ")\n",
|
654 | 644 | "\n",
|
655 | 645 | "model_trainer.train(input_data_config=[train], wait=False)"
|
656 | 646 | ]
|
657 |
| - }, |
658 |
| - { |
659 |
| - "cell_type": "markdown", |
660 |
| - "metadata": {}, |
661 |
| - "source": [ |
662 |
| - "Successful Run - https://tiny.amazon.com/125zldym8/IsenLink" |
663 |
| - ] |
664 | 647 | }
|
665 | 648 | ],
|
666 | 649 | "metadata": {
|
|
0 commit comments