[Perf] AdamWConfig: enable fused AdamW kernel

HAOCHENYE · HAOCHENYE · commit 874bc3e6cb8d · 2026-06-27T05:43:07.000Z
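Pass ``fused=True`` to the ``torch.optim.AdamW`` call in ``build``. The fused
kernel batches the per-parameter element-wise AdamW update across parameters
into far fewer CUDA kernel launches, cutting launch overhead and
optimizer-step time at large parameter counts.

Note: the call still forwards ``foreach=self.foreach``, and PyTorch raises an
error when ``foreach=True`` and ``fused=True`` are passed together, so the
fused kernel only applies on the non-foreach path (``self.foreach`` set to
``False`` or ``None``).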
diff --git a/xtuner/v1/config/optim.py b/xtuner/v1/config/optim.py
@@ -63,7 +63,7 @@ def build(self, model):
                 foreach=self.foreach,
             )
         return torch.optim.AdamW(
-            params, lr=self.lr, betas=self.betas, eps=self.eps, weight_decay=self.weight_decay, foreach=self.foreach
+            params, lr=self.lr, betas=self.betas, eps=self.eps, weight_decay=self.weight_decay, foreach=self.foreach, fused=True,
         )
 
 

Original file line number	Diff line number	Diff line change
`@@ -63,7 +63,7 @@ def build(self, model):`
`63`	`63`	`foreach=self.foreach,`
`64`	`64`	`)`
`65`	`65`	`return torch.optim.AdamW(`
`66`		`- params, lr=self.lr, betas=self.betas, eps=self.eps, weight_decay=self.weight_decay, foreach=self.foreach`
	`66`	`+ params, lr=self.lr, betas=self.betas, eps=self.eps, weight_decay=self.weight_decay, foreach=self.foreach, fused=True,`
`67`	`67`	`)`
`68`	`68`
`69`	`69`