- "source": "# LAB Multi-Phase Training Configuration\nexperiment_prefix = \"lab_multiphase_training_demo\"\nckpt_output_base_dir = \"/path/to/your/checkpoints\" # Update this path\n\n# Model and data paths - Update these to your actual paths\nbase_model_path = \"/path/to/your/base/model\" # e.g., granite-3.1-8b-starter-v2.1\nphase07_data_path = \"/path/to/knowledge_data.jsonl\" # Knowledge/facts data for Phase07\nphase10_data_path = \"/path/to/skills_plus_replay_data.jsonl\" # Skills + replay data for Phase10\n# Note: Phase10 data should include:\n# - New skills/task data\n# - Replay of Phase07 knowledge data \n# - Replay of base model's original instruction tuning data\n\n# Training hyperparameters\nmax_tokens_per_gpu = 30_000 # Memory limit per GPU (reduce if hitting OOM errors)\nmax_seq_len = 20_000 # Maximum sequence length\n\n# Distributed training setup (adjust for your hardware)\nnproc_per_node = 8 # Number of GPUs per node\nnnodes = 1 # Number of nodes\nnode_rank = 0 # This node's rank\nrdzv_id = 420 # Rendezvous ID\nrdzv_endpoint = \"0.0.0.0:12345\" # Master endpoint\n\nprint(f\"LAB Multi-Phase Experiment: {experiment_prefix}\")\nprint(f\"Output directory: {ckpt_output_base_dir}\")\nprint(f\"GPUs per node: {nproc_per_node}\")\nprint(f\"Max tokens per GPU: {max_tokens_per_gpu:,}\")\nprint(f\"\\nData composition:\")\nprint(f\" Phase07: Knowledge data only\")\nprint(f\" Phase10: Skills + Phase07 replay + Base model instruction replay\")\nprint(f\"\\n💡 Note: If you encounter OOM (Out of Memory) errors, reduce max_tokens_per_gpu\")"