@@ -36,14 +36,11 @@ def execute_training(self, algorithm_params: Dict[str, Any]) -> Any:
3636 "Install LoRA dependencies with: pip install 'training-hub[lora]'"
3737 ) from e
3838
39- # Separate torchrun parameters from training parameters
40- torchrun_keys = {'nproc_per_node' , 'nnodes' , 'node_rank' , 'rdzv_id' , 'rdzv_endpoint' , 'master_addr' , 'master_port' }
41-
42- # Extract torchrun parameters
43- torchrun_params = {k : v for k , v in algorithm_params .items () if k in torchrun_keys }
44-
45- # Extract training parameters (everything except torchrun params)
46- training_params = {k : v for k , v in algorithm_params .items () if k not in torchrun_keys }
39+ # Use all parameters as training parameters
40+ # Note: Torchrun parameters (nproc_per_node, etc.) are handled by the torchrun launcher,
41+ # not by the Python training code. The training code auto-detects distributed environment
42+ # via environment variables (WORLD_SIZE, LOCAL_RANK, etc.) set by torchrun.
43+ training_params = algorithm_params
4744
4845 # Unsloth multi-GPU setup: Let Accelerate/torchrun handle distributed training
4946 # No custom distributed initialization needed - Unsloth works with standard PyTorch DDP
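The replacement block above leans on torchrun's contract of exporting rank information to every worker process. As a minimal illustrative sketch (not part of this module), the distributed context can be read back from those variables like so:

import os

def detect_torchrun_env():
    # Illustrative helper: torchrun exports these variables to each worker.
    world_size = int(os.environ.get("WORLD_SIZE", "1"))
    rank = int(os.environ.get("RANK", "0"))
    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    return {"distributed": world_size > 1,
            "world_size": world_size, "rank": rank, "local_rank": local_rank}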
@@ -60,9 +57,6 @@ def execute_training(self, algorithm_params: Dict[str, Any]) -> Any:
         # Configure training arguments
         training_args = self._build_training_args(training_params)

-        # Determine dataset format and configure trainer accordingly
-        dataset_type = training_params.get('dataset_type', 'chat_template')
-
         # Use the same trainer configuration for all dataset types since we pre-process in _prepare_dataset
         trainer = SFTTrainer(
             model=model,
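Removing the dataset_type branch works because, as the surviving comment says, every dataset is normalized before it reaches the trainer. A rough sketch of that idea, assuming samples are flattened into a single "text" field (the field names and the helper are illustrative, not the module's actual _prepare_dataset):

def to_text_record(example, tokenizer):
    # Illustrative: collapse either chat-style or prompt/response samples
    # into one "text" field so a single SFTTrainer config covers both.
    if "messages" in example:
        text = tokenizer.apply_chat_template(example["messages"], tokenize=False)
    else:
        text = f"{example.get('input', '')}\n{example.get('output', '')}"
    return {"text": text}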
@@ -97,7 +91,6 @@ def _load_unsloth_model(self, params: Dict[str, Any]) -> tuple:

         # Handle device placement for multi-GPU training
         device_map_config = {}
-        import os
         if params.get('enable_model_splitting', False):
             # Use balanced device mapping for large models
             device_map_config['device_map'] = "balanced"
@@ -108,11 +101,22 @@ def _load_unsloth_model(self, params: Dict[str, Any]) -> tuple:
             torch.cuda.set_device(local_rank)
             device_map_config['device_map'] = {"": local_rank}

+        # Configure 4-bit quantization options (QLoRA-recommended settings)
+        quantization_kwargs = {}
+        if load_in_4bit:
+            quantization_kwargs.update({
+                'bnb_4bit_quant_type': params.get('bnb_4bit_quant_type', 'nf4'),  # QLoRA-recommended quant type
+                'bnb_4bit_compute_dtype': params.get('bnb_4bit_compute_dtype', 'bfloat16'),  # QLoRA-recommended compute dtype
+                'bnb_4bit_use_double_quant': params.get('bnb_4bit_use_double_quant', True),  # nested quantization, as in QLoRA
+            })
+
         model, tokenizer = FastLanguageModel.from_pretrained(
             model_name=params['model_path'],
             max_seq_length=params.get('max_seq_len', 2048),
             dtype=None,  # Auto-detect
             load_in_4bit=load_in_4bit,
+            load_in_8bit=load_in_8bit,
+            **quantization_kwargs,
             **device_map_config,
             # Additional Unsloth optimizations
             # trust_remote_code=params.get('trust_remote_code', False),
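The three bnb_4bit_* keys mirror the knobs exposed by transformers' BitsAndBytesConfig. A standalone sketch of the same QLoRA-style settings, assuming the kwargs built above are ultimately forwarded to bitsandbytes:

import torch
from transformers import BitsAndBytesConfig

# QLoRA-style 4-bit settings equivalent to the kwargs built above
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)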
@@ -136,7 +140,7 @@ def _apply_lora_config(self, model, params: Dict[str, Any]):
             r=params.get('lora_r', 16),
             target_modules=target_modules,
             lora_alpha=params.get('lora_alpha', 32),
-            lora_dropout=params.get('lora_dropout', 0.1),
+            lora_dropout=params.get('lora_dropout', 0.0),  # 0.0 is optimized for Unsloth
             bias="none",
             use_gradient_checkpointing="unsloth",  # Unsloth's optimized gradient checkpointing
             random_state=params.get('seed', 3407),
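For readers more used to plain PEFT than Unsloth's get_peft_model wrapper, roughly the same adapter would be described as below; the target-module list is the common choice for Llama-style models and is an assumption here, not taken from this file:

from peft import LoraConfig

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.0,  # matches the new Unsloth-friendly default above
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)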
@@ -205,7 +209,6 @@ def _build_training_args(self, params: Dict[str, Any]):

         # Determine actual number of GPUs being used
         import torch
-        import os

         # If we're in a distributed environment, use world size
         if 'WORLD_SIZE' in os.environ:
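With the duplicate import gone, the module-level os import serves the WORLD_SIZE check. That check typically feeds a batch-size calculation along these lines; the helper and its signature are illustrative, not this module's actual _build_training_args logic:

import os
import torch

def resolve_grad_accumulation(effective_batch_size: int, micro_batch_size: int) -> int:
    # Prefer the torchrun-provided world size; otherwise count visible GPUs.
    if "WORLD_SIZE" in os.environ:
        num_gpus = int(os.environ["WORLD_SIZE"])
    else:
        num_gpus = max(torch.cuda.device_count(), 1)
    return max(effective_batch_size // (micro_batch_size * num_gpus), 1)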
@@ -380,7 +383,7 @@ def train(self,
         LoRA Parameters (from PEFT extender):
             lora_r: LoRA rank (default: 16)
             lora_alpha: LoRA alpha parameter (default: 32)
-            lora_dropout: LoRA dropout rate (default: 0.1)
+            lora_dropout: LoRA dropout rate (default: 0.0, optimized for Unsloth)
             target_modules: List of module names to apply LoRA to (default: auto-detect)
             use_rslora: Use Rank-Stabilized LoRA (default: False)
             use_dora: Use DoRA (Weight-Decomposed Low-Rank Adaptation) (default: False)
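On the use_rslora flag documented above: rank-stabilized LoRA swaps the usual alpha/r adapter scaling for alpha/sqrt(r), which is what lets higher ranks keep a stable effective scale. A toy comparison with illustrative numbers:

import math

lora_alpha, lora_r = 32, 64
standard_scale = lora_alpha / lora_r            # 0.5, classic LoRA scaling
rslora_scale = lora_alpha / math.sqrt(lora_r)   # 4.0, rank-stabilized scaling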
@@ -604,6 +607,8 @@ def get_optional_params(self) -> Dict[str, Type]:
             'field_output': str,
             # Multi-GPU model splitting
             'enable_model_splitting': bool,
+            # Model saving
+            'save_model': bool,
         }

         # Combine all parameter types
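The new save_model entry simply extends the optional-parameter type map. One way such a map can be used, sketched with a hypothetical validator that is not part of this module:

from typing import Any, Dict, Type

def validate_optional_params(params: Dict[str, Any], schema: Dict[str, Type]) -> None:
    # Illustrative: reject recognized keys whose values have the wrong type.
    for key, expected in schema.items():
        if key in params and not isinstance(params[key], expected):
            raise TypeError(
                f"{key} expects {expected.__name__}, got {type(params[key]).__name__}"
            )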
@@ -689,17 +694,17 @@ def lora_sft(model_path: str,
     LoRA Parameters:
         lora_r: LoRA rank (default: 16)
         lora_alpha: LoRA alpha parameter (default: 32)
-        lora_dropout: LoRA dropout rate (default: 0.1)
+        lora_dropout: LoRA dropout rate (default: 0.0, optimized for Unsloth)
         target_modules: List of module names to apply LoRA to (default: auto-detect)

     Training Parameters:
         num_epochs: Number of training epochs (default: 3)
         effective_batch_size: Effective batch size across all GPUs
-        micro_batch_size: Batch size per GPU (default: 1)
+        micro_batch_size: Batch size per GPU (default: 2)
         gradient_accumulation_steps: Steps to accumulate gradients (default: 1)
         learning_rate: Learning rate (default: 2e-4)
         max_seq_len: Maximum sequence length (default: 2048)
-        lr_scheduler: Learning rate scheduler (default: 'cosine')
+        lr_scheduler: Learning rate scheduler (default: 'linear')
         warmup_steps: Number of warmup steps (default: 10)

     Quantization Parameters (QLoRA):
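Pulling the documented defaults together, a call to the convenience function could look like the sketch below. Only model_path and the tuning knobs come from the docstring above; the data and output arguments (and their names) are assumptions about the full signature:

# Hypothetical invocation; argument names other than model_path and the
# documented tuning parameters are assumed, not confirmed by this diff.
result = lora_sft(
    model_path="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model
    data_path="./data/train.jsonl",                 # assumed data argument
    lora_r=16,
    lora_alpha=32,
    lora_dropout=0.0,
    num_epochs=3,
    micro_batch_size=2,
    learning_rate=2e-4,
    max_seq_len=2048,
    lr_scheduler="linear",
    warmup_steps=10,
)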