- "source": "# LAB Multi-Phase Training Configuration\nexperiment_prefix = \"lab_multiphase_training_demo\"\nckpt_output_base_dir = \"/path/to/your/checkpoints\" # Update this path\n\n# Model and data paths - Update these to your actual paths\nbase_model_path = \"/path/to/your/base/model\" # e.g., granite-3.1-8b-starter-v2.1\nphase07_data_path = \"/path/to/knowledge_data.jsonl\" # Knowledge/facts data for Phase07\nphase10_data_path = \"/path/to/skills_plus_replay_data.jsonl\" # Skills + replay data for Phase10\n# Note: Phase10 data should include:\n# - New skills/task data\n# - Replay of Phase07 knowledge data \n# - Replay of base model's original instruction tuning data\n\n# Training hyperparameters\nmax_tokens_per_gpu = 30_000 # Memory limit per GPU (reduce if hitting OOM errors)\nmax_seq_len = 20_000 # Maximum sequence length\n\n# Distributed training setup (adjust for your hardware)\nnproc_per_node = 8 # Number of GPUs per node\nnnodes = 1 # Number of nodes\nnode_rank = 0 # This node's rank\nrdzv_id = 420 # Rendezvous ID\nrdzv_endpoint = \"0.0.0.0:12345\" # Master endpoint\n\nprint(f\"LAB Multi-Phase Experiment: {experiment_prefix}\")\nprint(f\"Output directory: {ckpt_output_base_dir}\")\nprint(f\"GPUs per node: {nproc_per_node}\")\nprint(f\"Max tokens per GPU: {max_tokens_per_gpu:,}\")\nprint(f\"\\nData composition:\")\nprint(f\" Phase07: Knowledge data only\")\nprint(f\" Phase10: Skills + Phase07 replay + Base model instruction replay\")\nprint(f\"\\n💡 Note: If you encounter OOM (Out of Memory) errors, reduce max_tokens_per_gpu\")"