Merged
54 changes: 51 additions & 3 deletions examples/notebooks/lab_multiphase_training_tutorial.ipynb
@@ -45,7 +45,22 @@
{
"cell_type": "markdown",
"metadata": {},
"source": "## Logging Configuration\n\nSet up logging to prevent notebook crashes from excessive output while still showing essential progress and error information.\n\n**Note:** While this notebook will walk you through a breakdown of all the steps and contains the end-to-end pipeline, we also provide an example script for any significantly long-running jobs for reproducibility, flexibility, and logging consistency in case of notebook disconnects. You can find the script at `scripts/lab_multiphase_training.py`.\n\n**Quick script usage:**\n```bash\npython scripts/lab_multiphase_training.py \\\n --base-model-path /path/to/model \\\n --phase07-data-path /path/to/knowledge.jsonl \\\n --phase10-data-path /path/to/skills_replay.jsonl \\\n --ckpt-output-base-dir /path/to/checkpoints\n```"
"source": [
"## Logging Configuration\n",
"\n",
"Set up logging to prevent notebook crashes from excessive output while still showing essential progress and error information.\n",
"\n",
"**Note:** While this notebook will walk you through a breakdown of all the steps and contains the end-to-end pipeline, we also provide an example script for any significantly long-running jobs for reproducibility, flexibility, and logging consistency in case of notebook disconnects. You can find the script at `scripts/lab_multiphase_training.py`.\n",
"\n",
"**Quick script usage:**\n",
"```bash\n",
"python scripts/lab_multiphase_training.py \\\n",
" --base-model-path /path/to/model \\\n",
" --phase07-data-path /path/to/knowledge.jsonl \\\n",
" --phase10-data-path /path/to/skills_replay.jsonl \\\n",
" --ckpt-output-base-dir /path/to/checkpoints\n",
"```"
]
},
{
"cell_type": "code",
@@ -147,7 +162,40 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "# LAB Multi-Phase Training Configuration\nexperiment_prefix = \"lab_multiphase_training_demo\"\nckpt_output_base_dir = \"/path/to/your/checkpoints\" # Update this path\n\n# Model and data paths - Update these to your actual paths\nbase_model_path = \"/path/to/your/base/model\" # e.g., granite-3.1-8b-starter-v2.1\nphase07_data_path = \"/path/to/knowledge_data.jsonl\" # Knowledge/facts data for Phase07\nphase10_data_path = \"/path/to/skills_plus_replay_data.jsonl\" # Skills + replay data for Phase10\n# Note: Phase10 data should include:\n# - New skills/task data\n# - Replay of Phase07 knowledge data \n# - Replay of base model's original instruction tuning data\n\n# Training hyperparameters\nmax_tokens_per_gpu = 25_000 # Memory limit per GPU (reduce if hitting OOM errors)\nmax_seq_len = 20_000 # Maximum sequence length\n\n# Distributed training setup (adjust for your hardware)\nnproc_per_node = 8 # Number of GPUs per node\nnnodes = 1 # Number of nodes\nnode_rank = 0 # This node's rank\nrdzv_id = 420 # Rendezvous ID\nrdzv_endpoint = \"0.0.0.0:12345\" # Master endpoint\n\nprint(f\"LAB Multi-Phase Experiment: {experiment_prefix}\")\nprint(f\"Output directory: {ckpt_output_base_dir}\")\nprint(f\"GPUs per node: {nproc_per_node}\")\nprint(f\"Max tokens per GPU: {max_tokens_per_gpu:,}\")\nprint(f\"\\nData composition:\")\nprint(f\" Phase07: Knowledge data only\")\nprint(f\" Phase10: Skills + Phase07 replay + Base model instruction replay\")\nprint(f\"\\n💡 Note: If you encounter OOM (Out of Memory) errors, reduce max_tokens_per_gpu\")"
"source": [
"# LAB Multi-Phase Training Configuration\n",
"experiment_prefix = \"lab_multiphase_training_demo\"\n",
"ckpt_output_base_dir = \"/path/to/your/checkpoints\" # Update this path\n",
"\n",
"# Model and data paths - Update these to your actual paths\n",
"base_model_path = \"/path/to/your/base/model\" # e.g., granite-3.1-8b-starter-v2.1\n",
"phase07_data_path = \"/path/to/knowledge_data.jsonl\" # Knowledge/facts data for Phase07\n",
"phase10_data_path = \"/path/to/skills_plus_replay_data.jsonl\" # Skills + replay data for Phase10\n",
"# Note: Phase10 data should include:\n",
"# - New skills/task data\n",
"# - Replay of Phase07 knowledge data \n",
"# - Replay of base model's original instruction tuning data\n",
"\n",
"# Training hyperparameters\n",
"max_tokens_per_gpu = 25_000 # Memory limit per GPU (reduce if hitting OOM errors)\n",
"max_seq_len = 20_000 # Maximum sequence length\n",
"\n",
"# Distributed training setup (adjust for your hardware)\n",
"nproc_per_node = 8 # Number of GPUs per node\n",
"nnodes = 1 # Number of nodes\n",
"node_rank = 0 # This node's rank\n",
"rdzv_id = 47 # Rendezvous ID\n",
"rdzv_endpoint = \"0.0.0.0:12345\" # Master endpoint\n",
"\n",
"print(f\"LAB Multi-Phase Experiment: {experiment_prefix}\")\n",
"print(f\"Output directory: {ckpt_output_base_dir}\")\n",
"print(f\"GPUs per node: {nproc_per_node}\")\n",
"print(f\"Max tokens per GPU: {max_tokens_per_gpu:,}\")\n",
"print(f\"\\nData composition:\")\n",
"print(f\" Phase07: Knowledge data only\")\n",
"print(f\" Phase10: Skills + Phase07 replay + Base model instruction replay\")\n",
"print(f\"\\n💡 Note: If you encounter OOM (Out of Memory) errors, reduce max_tokens_per_gpu\")"
]
},
{
"cell_type": "markdown",
@@ -511,4 +559,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
}
}
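The configuration cell in the diff notes that the Phase10 file should combine new skills data with a replay of the Phase07 knowledge data and of the base model's original instruction-tuning data. A minimal sketch of that mixing step is below; the function name `mix_phase10_data` and the record layout are illustrative, not part of the tutorial itself.

```python
import json


def mix_phase10_data(skills_path, phase07_path, instruct_replay_path, out_path):
    """Concatenate new skills data with Phase07 and base-model
    instruction replay into a single Phase10 JSONL file."""
    records = []
    for src in (skills_path, phase07_path, instruct_replay_path):
        with open(src, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:  # skip blank lines between records
                    records.append(json.loads(line))
    with open(out_path, "w", encoding="utf-8") as f:
        for rec in records:
            f.write(json.dumps(rec) + "\n")
    return len(records)
```

In practice you may also want to control the replay ratio (how many Phase07 and instruction-tuning samples are re-included) rather than concatenating the files wholesale.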
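The distributed settings in the configuration cell (`nproc_per_node`, `nnodes`, `node_rank`, `rdzv_id`, `rdzv_endpoint`) mirror the rendezvous flags of PyTorch's `torchrun` launcher. A sketch of how they might map onto a launch command is below, assuming the script path from the diff; the `--rdzv-backend=c10d` choice and the helper `torchrun_argv` are assumptions, not something the tutorial specifies.

```python
# Build a torchrun command line from the notebook's distributed settings.
# Flag names follow torchrun's elastic-launch CLI; the backend choice
# (c10d) is an assumption for a single- or multi-node setup.
def torchrun_argv(script, nproc_per_node, nnodes, node_rank,
                  rdzv_id, rdzv_endpoint, script_args=()):
    return [
        "torchrun",
        f"--nnodes={nnodes}",
        f"--nproc-per-node={nproc_per_node}",
        f"--node-rank={node_rank}",
        "--rdzv-backend=c10d",
        f"--rdzv-id={rdzv_id}",
        f"--rdzv-endpoint={rdzv_endpoint}",
        script,
        *script_args,
    ]


cmd = torchrun_argv(
    "scripts/lab_multiphase_training.py",
    nproc_per_node=8, nnodes=1, node_rank=0,
    rdzv_id=47, rdzv_endpoint="0.0.0.0:12345",
)
print(" ".join(cmd))
```

Running the same command on every node, with `--node-rank` set per node and all nodes pointing at the same `--rdzv-endpoint`, is what lets the workers rendezvous into one training job.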