@@ -67,6 +67,7 @@ class TrainingArguments:
         saver_option (Optional[SaverOptions]): Options for checkpoint saver. Defaults to None.
         ckpt_save_arg (Optional[CheckpointSaveArguments]): Arguments for checkpoint save. Defaults to None.
         ckpt_load_arg (Optional[CheckpointLoadArguments]): Arguments for checkpoint load. Defaults to None.
+        mixed_precision (Optional[str]): Mixed precision training mode. Defaults to None. Only "bf16" and "fp16" are supported.
     """

     gradient_accumulation_steps: int = 1
@@ -90,6 +91,7 @@ class TrainingArguments:
     saver_option: Optional[SaverOptions] = None
     ckpt_save_arg: Optional[CheckpointSaveArguments] = None
     ckpt_load_arg: Optional[CheckpointLoadArguments] = None
+    mixed_precision: Optional[str] = None


 class Trainer:
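
For context, a minimal usage sketch of the new field (only the two field names shown come from this diff; any other `TrainingArguments` parameters are assumptions):

    # Hypothetical construction; only "bf16" and "fp16" are accepted,
    # and None (the default) leaves mixed precision disabled.
    args = TrainingArguments(
        gradient_accumulation_steps=4,
        mixed_precision="bf16",
    )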
@@ -197,11 +199,15 @@ def __init__(
         self.dense_lr_scheduler = dense_optimizers[1]
         self.sparse_optimizer = sparse_optimizer
         self.data_to_cuda = data_to_cuda
+        self.mixed_precision = args.mixed_precision
+        if self.mixed_precision is not None:
+            assert self.mixed_precision in ["bf16", "fp16"], "mixed_precision must be 'bf16' or 'fp16'"
         ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
         init_kwargs = InitProcessGroupKwargs(timeout=timedelta(seconds=1800))
         self.accelerator = Accelerator(
             kwargs_handlers=[ddp_kwargs, init_kwargs],
             gradient_accumulation_steps=args.gradient_accumulation_steps,
+            mixed_precision=self.mixed_precision,
             **kwargs,
         )
         self.gradient_accumulation_steps = args.gradient_accumulation_steps
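
The assertion runs before the Accelerator is built, so an invalid string fails fast instead of surfacing later inside accelerate. A minimal standalone sketch of the Accelerator side (`Accelerator(mixed_precision=...)` is part of the Hugging Face accelerate API; the rest is illustrative):

    from accelerate import Accelerator

    # "bf16"/"fp16" force that precision for autocast regions;
    # None defers to the accelerate launch config/environment.
    accelerator = Accelerator(mixed_precision="bf16")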
@@ -559,7 +565,8 @@ def _train_step(self, data, epoch, metrics):
         self.dense_optimizer.zero_grad()
         if self.sparse_optimizer is not None:
             self.sparse_optimizer.zero_grad()
-        loss = self.model(data)
+        with self.accelerator.autocast():
+            loss = self.model(data)
         metrics.update(epoch=epoch)
         metrics.update(loss=loss)
         metrics.update(get_global_metrics())
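
A sketch of the resulting train-step pattern (`model` and `batch` are illustrative names; `accelerator.autocast()` and `accelerator.backward()` are real accelerate APIs, and autocast() is effectively a no-op when mixed precision is disabled):

    # Forward pass in reduced precision; backward stays outside the
    # context so accelerate can handle loss scaling/unscaling for fp16.
    with accelerator.autocast():
        loss = model(batch)
    accelerator.backward(loss)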