@@ -4,43 +4,52 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

-from typing import Dict
+from typing import Dict, Type

 from torch.distributed.tensor import Replicate, Shard
 from torch.distributed.tensor.parallel import (
     ColwiseParallel,
     PrepareModuleInput,
     RowwiseParallel,
     SequenceParallel,
 )
 from torch.distributed.tensor.parallel.style import ParallelStyle


-# Define the Tensor Parallel plan for Llama3 model, which will also be shared with 3.1, 3.2, and 3.3 models
-BASE_LLAMA_TP_PLAN = {
-    "tok_embeddings": RowwiseParallel(
-        input_layouts=Replicate(), output_layouts=Shard(1)
-    ),
-    "norm": SequenceParallel(),
-    "output": ColwiseParallel(input_layouts=Shard(1), output_layouts=Replicate()),
-    "layers.*.attn": PrepareModuleInput(
-        input_layouts=(Shard(1), None),
-        desired_input_layouts=(Replicate(), None),
-    ),
-    "layers.*.mlp": PrepareModuleInput(
-        input_layouts=(Shard(1),),
-        desired_input_layouts=(Replicate(),),
-    ),
-    "layers.*.sa_norm": SequenceParallel(),
-    "layers.*.mlp_norm": SequenceParallel(),
-    "layers.*.attn.q_proj": ColwiseParallel(),
-    "layers.*.attn.k_proj": ColwiseParallel(),
-    "layers.*.attn.v_proj": ColwiseParallel(),
-    "layers.*.attn.output_proj": RowwiseParallel(output_layouts=Shard(1)),
-    "layers.*.mlp.w1": ColwiseParallel(),
-    "layers.*.mlp.w2": RowwiseParallel(output_layouts=Shard(1)),
-    "layers.*.mlp.w3": ColwiseParallel(),
-}
+def _get_base_llama_tp_plan(
+    _sequence_parallel_cls: Type[ParallelStyle] = SequenceParallel,
+    _colwise_parallel_cls: Type[ParallelStyle] = ColwiseParallel,
+    _rowwise_parallel_cls: Type[ParallelStyle] = RowwiseParallel,
+) -> Dict[str, ParallelStyle]:
+    """
+    Define the Tensor Parallel plan for the Llama3 model, which is also shared with the 3.1, 3.2, and 3.3 models.
+    """
+    return {
+        "tok_embeddings": _rowwise_parallel_cls(
+            input_layouts=Replicate(), output_layouts=Shard(1)
+        ),
+        "norm": _sequence_parallel_cls(),
+        "output": _colwise_parallel_cls(
+            input_layouts=Shard(1), output_layouts=Replicate()
+        ),
+        "layers.*.attn": PrepareModuleInput(
+            input_layouts=(Shard(1), None),
+            desired_input_layouts=(Replicate(), None),
+        ),
+        "layers.*.mlp": PrepareModuleInput(
+            input_layouts=(Shard(1),),
+            desired_input_layouts=(Replicate(),),
+        ),
+        "layers.*.sa_norm": _sequence_parallel_cls(),
+        "layers.*.mlp_norm": _sequence_parallel_cls(),
+        "layers.*.attn.q_proj": _colwise_parallel_cls(),
+        "layers.*.attn.k_proj": _colwise_parallel_cls(),
+        "layers.*.attn.v_proj": _colwise_parallel_cls(),
+        "layers.*.attn.output_proj": _rowwise_parallel_cls(output_layouts=Shard(1)),
+        "layers.*.mlp.w1": _colwise_parallel_cls(),
+        "layers.*.mlp.w2": _rowwise_parallel_cls(output_layouts=Shard(1)),
+        "layers.*.mlp.w3": _colwise_parallel_cls(),
+    }


 def base_llama_tp_plan() -> Dict[str, ParallelStyle]:
@@ -50,4 +59,19 @@ def base_llama_tp_plan() -> Dict[str, ParallelStyle]:
     Returns:
         Dict[str, Any]: The tensor parallel plan for Llama3 model.
     """
-    return BASE_LLAMA_TP_PLAN
+    return _get_base_llama_tp_plan()
+
+
+def fp8_llama_tp_plan() -> Dict[str, ParallelStyle]:
+    """
+    Return the tensor parallel plan for the Llama3 model that uses float8 all-gathers for both
+    rowwise and colwise computation. This is currently only compatible with float8 fine-tuning
+    with "tensorwise" scaling. The tensor parallel plan is shared between the 3.1, 3.2, and 3.3 models.
+
+    Returns:
+        Dict[str, Any]: The float8-enabled tensor parallel plan for Llama3 model.
+    """
+    return _get_base_llama_tp_plan(
+        _colwise_parallel_cls=Float8ColwiseParallel,
+        _rowwise_parallel_cls=Float8RowwiseParallel,
+    )
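
For reference on how a plan like this is consumed: the keys are fully qualified submodule names (with "*" wildcards) and the values are DTensor ParallelStyle instances, which PyTorch's parallelize_module applies over a device mesh. The following is a minimal, hypothetical usage sketch, not part of this commit; it assumes torch.distributed is already initialized with tp_size tensor-parallel ranks, that model is a Llama3 module whose submodule names match the plan's keys, and that base_llama_tp_plan is imported from this file.

import torch
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor.parallel import parallelize_module


def apply_tp(model: torch.nn.Module, tp_size: int) -> torch.nn.Module:
    # Hypothetical helper, not part of this commit: build a 1-D tensor-parallel
    # device mesh, then let parallelize_module match the plan's wildcard FQNs
    # (e.g. "layers.*.attn.q_proj") against the model's submodules and shard
    # their parameters as DTensors.
    tp_mesh = init_device_mesh("cuda", (tp_size,))
    return parallelize_module(model, tp_mesh, base_llama_tp_plan())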
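
Note that Float8ColwiseParallel and Float8RowwiseParallel are referenced by fp8_llama_tp_plan() but their import is not visible in the hunks above. torchao ships these styles in torchao.float8.float8_tensor_parallel; a guarded import along the following lines is one plausible way to bring them in, stated here as an assumption rather than what the commit actually does.

# Assumed import location, not shown in the diff above: torchao provides
# float8-aware colwise/rowwise parallel styles for DTensor tensor parallelism.
try:
    from torchao.float8.float8_tensor_parallel import (
        Float8ColwiseParallel,
        Float8RowwiseParallel,
    )
except ImportError as err:
    raise ImportError(
        "fp8_llama_tp_plan() requires torchao with float8 support installed"
    ) from err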