Commit df526dd

Enable FP8 full finetune distributed
Adds float8 (FP8) training support to the full_finetune_distributed recipe behind the new enable_fp8_training and fp8_recipe_name config flags, along with a float8-aware Llama3 tensor parallel plan and torchao-based conversion helpers. Based on #2404 by @nathan-az
1 parent f1ecdd6 commit df526dd

File tree

4 files changed: +157 -31 lines changed

  recipes/full_finetune_distributed.py
  torchtune/models/llama3/__init__.py
  torchtune/models/llama3/_parallelism.py
  torchtune/training/quantization.py
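The recipe wires float8 training to two new config keys, enable_fp8_training and fp8_recipe_name, read via cfg.get(...) in __init__ (first diff below). A minimal sketch of those keys as a config object follows; the OmegaConf usage and surrounding values are illustrative only, not taken from this commit:

# Illustrative sketch only: shows the two config keys the recipe reads; everything
# else about a real full_finetune_distributed config is omitted.
from omegaconf import DictConfig, OmegaConf

cfg: DictConfig = OmegaConf.create(
    {
        # Swap nn.Linear modules for Float8Linear during _setup_model.
        "enable_fp8_training": True,
        # One of "tensorwise", "rowwise", "rowwise_with_gw_hp"; omit (None) to get the
        # default tensorwise config with FSDP float8 all-gather enabled.
        "fp8_recipe_name": "tensorwise",
    }
)

# Mirrors how the recipe reads the flags in __init__.
enable_fp8_training = cfg.get("enable_fp8_training", False)
fp8_recipe_name = cfg.get("fp8_recipe_name", None)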

recipes/full_finetune_distributed.py

Lines changed: 27 additions & 0 deletions
@@ -19,6 +19,7 @@
 from torch.distributed._tensor import DTensor
 from torch.distributed.tensor.parallel import parallelize_module
 from torch.optim import Optimizer
+from torchao.float8 import precompute_float8_dynamic_scale_for_fsdp
 from torchdata.stateful_dataloader import StatefulDataLoader
 from torchdata.stateful_dataloader.sampler import StatefulDistributedSampler
 from torchtune import config, modules, training, utils
@@ -33,6 +34,11 @@
     TrainingProgress,
 )
 from torchtune.training.lr_schedulers import get_lr
+from torchtune.training.quantization import (
+    convert_to_float8_training,
+    is_fp8_tensorwise_scaling,
+    validate_float8_tp_plan,
+)
 
 from tqdm import tqdm
 
@@ -184,6 +190,8 @@ def __init__(self, cfg: DictConfig) -> None:
         self._optimizer_in_bwd = cfg.get("optimizer_in_bwd", False)
         self._clip_grad_norm = cfg.get("clip_grad_norm", None)
         self._checkpoint_client = CheckpointClient(cfg)
+        self._enable_fp8_training = cfg.get("enable_fp8_training", False)
+        self._fp8_recipe_name = cfg.get("fp8_recipe_name", None)
 
         # Optimizer in backward is not compatible with gradient accumulation or gradient clipping
         if self._optimizer_in_bwd:
@@ -545,6 +553,15 @@ def _setup_model(
         if self._compile:
             training.compile_model(model, verbose=self._is_rank_zero)
 
+        if self._enable_fp8_training:
+            # Requires https://github.com/pytorch/pytorch/pull/148922
+            if torch.__version__ < "2.8.0.dev20250318":
+                raise RuntimeError(
+                    "Float8 fine-tuning requires PyTorch 2.8.0.dev20250318 or later."
+                )
+            validate_float8_tp_plan(self.tp_plan, self._fp8_recipe_name)
+            model = convert_to_float8_training(model, self._fp8_recipe_name)
+
         # Apply tensor parallelism to the model
         if self.parallel_dims.tp_enabled:
             if not self.parallel_dims.dp_enabled and self.fsdp_cpu_offload:
@@ -846,6 +863,16 @@ def train(self) -> None:
                     if self._lr_scheduler is not None:
                         self._lr_scheduler.step()
 
+                    # If float8 training is enabled, perform a single all-reduce to compute the
+                    # scale for all float8 parameters efficiently instead of doing many small
+                    # all-reduces for each parameter
+                    if (
+                        self._enable_fp8_training
+                        and is_fp8_tensorwise_scaling(self._fp8_recipe_name)
+                        and self.dp_degree > 1
+                    ):
+                        precompute_float8_dynamic_scale_for_fsdp(self._model)
+
                     loss_to_log = running_loss.item() / num_tokens
                     pbar.update(1)
                     pbar.set_description(
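Read together, the recipe changes enforce a fixed ordering: the float8 swap happens in _setup_model before tensor parallelism and FSDP wrapping, and the scale precompute runs once per optimizer step, only for tensorwise scaling with data parallelism. Below is a condensed, hedged sketch of that flow on a toy single-device model; it skips FSDP/TP wrapping, uses dp_degree as a stand-in for the recipe's data-parallel degree, and would need a float8-capable GPU (e.g. H100) plus a recent torchao to actually run:

# Condensed sketch of the float8 flow, not the recipe itself. Assumes CUDA with
# float8 support and torchao >= 0.9; FSDP/TP wrapping is elided for brevity.
import torch
from torch import nn
from torchao.float8 import precompute_float8_dynamic_scale_for_fsdp
from torchtune.training.quantization import (
    convert_to_float8_training,
    is_fp8_tensorwise_scaling,
)

fp8_recipe_name = None  # None -> tensorwise with FSDP float8 all-gather
model = nn.Sequential(nn.Linear(256, 256), nn.Linear(256, 256)).cuda()

# Step 1 (_setup_model): swap nn.Linear -> Float8Linear before any parallelism.
model = convert_to_float8_training(model, fp8_recipe_name)

optimizer = torch.optim.AdamW(model.parameters())
dp_degree = 1  # stand-in for the recipe's data-parallel degree

for _ in range(3):
    loss = model(torch.randn(8, 256, device="cuda")).sum()
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    # Step 2 (train): with tensorwise scaling and FSDP sharding (dp_degree > 1),
    # fuse the per-parameter amax/scale all-reduces into a single call.
    if is_fp8_tensorwise_scaling(fp8_recipe_name) and dp_degree > 1:
        precompute_float8_dynamic_scale_for_fsdp(model)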

torchtune/models/llama3/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -15,7 +15,7 @@
     qlora_llama3_70b,
     qlora_llama3_8b,
 )
-from ._parallelism import base_llama_tp_plan
+from ._parallelism import base_llama_tp_plan, fp8_llama_tp_plan
 from ._tokenizer import Llama3Tokenizer
 
 __all__ = [
@@ -30,4 +30,5 @@
     "qlora_llama3_8b",
     "qlora_llama3_70b",
     "base_llama_tp_plan",
+    "fp8_llama_tp_plan",
 ]

torchtune/models/llama3/_parallelism.py

Lines changed: 60 additions & 27 deletions
@@ -4,7 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from typing import Dict
+from typing import Dict, Type
 
 from torch.distributed.tensor import Replicate, Shard
 from torch.distributed.tensor.parallel import (
@@ -15,32 +15,53 @@
 )
 from torch.distributed.tensor.parallel.style import ParallelStyle
 
+from torchao.float8.float8_tensor_parallel import (
+    Float8ColwiseParallel,
+    Float8RowwiseParallel,
+)
+
 
-# Define the Tensor Parallel plan for Llama3 model, which will also be shared with 3.1, 3.2, and 3.3 models
-BASE_LLAMA_TP_PLAN = {
-    "tok_embeddings": RowwiseParallel(
-        input_layouts=Replicate(), output_layouts=Shard(1)
-    ),
-    "norm": SequenceParallel(),
-    "output": ColwiseParallel(input_layouts=Shard(1), output_layouts=Replicate()),
-    "layers.*.attn": PrepareModuleInput(
-        input_layouts=(Shard(1), None),
-        desired_input_layouts=(Replicate(), None),
-    ),
-    "layers.*.mlp": PrepareModuleInput(
-        input_layouts=(Shard(1),),
-        desired_input_layouts=(Replicate(),),
-    ),
-    "layers.*.sa_norm": SequenceParallel(),
-    "layers.*.mlp_norm": SequenceParallel(),
-    "layers.*.attn.q_proj": ColwiseParallel(),
-    "layers.*.attn.k_proj": ColwiseParallel(),
-    "layers.*.attn.v_proj": ColwiseParallel(),
-    "layers.*.attn.output_proj": RowwiseParallel(output_layouts=Shard(1)),
-    "layers.*.mlp.w1": ColwiseParallel(),
-    "layers.*.mlp.w2": RowwiseParallel(output_layouts=Shard(1)),
-    "layers.*.mlp.w3": ColwiseParallel(),
-}
+def _get_base_llama_tp_plan(
+    _sequence_parallel_cls: Type[ParallelStyle] = SequenceParallel,
+    _colwise_parallel_cls: Type[ParallelStyle] = ColwiseParallel,
+    _rowwise_parallel_cls: Type[ParallelStyle] = RowwiseParallel,
+) -> Dict[str, ParallelStyle]:
+    """
+    Define the Tensor Parallel plan for the Llama3 model, which is also shared with the 3.1, 3.2, and 3.3 models.
+    """
+    return {
+        "tok_embeddings": _rowwise_parallel_cls(
+            input_layouts=Replicate(), output_layouts=Shard(1)
+        ),
+        "norm": _sequence_parallel_cls(),
+        "output": _colwise_parallel_cls(
+            input_layouts=Shard(1), output_layouts=Replicate()
+        ),
+        "layers.*.attn": PrepareModuleInput(
+            input_layouts=(Shard(1), None),
+            desired_input_layouts=(Replicate(), None),
+        ),
+        "layers.*.mlp": PrepareModuleInput(
+            input_layouts=(Shard(1),),
+            desired_input_layouts=(Replicate(),),
+        ),
+        "layers.*.sa_norm": _sequence_parallel_cls(),
+        "layers.*.mlp_norm": _sequence_parallel_cls(),
+        "layers.*.attn.q_proj": _colwise_parallel_cls(),
+        "layers.*.attn.k_proj": _colwise_parallel_cls(),
+        "layers.*.attn.v_proj": _colwise_parallel_cls(),
+        "layers.*.attn.output_proj": _rowwise_parallel_cls(output_layouts=Shard(1)),
+        "layers.*.mlp.w1": _colwise_parallel_cls(),
+        "layers.*.mlp.w2": _rowwise_parallel_cls(output_layouts=Shard(1)),
+        "layers.*.mlp.w3": _colwise_parallel_cls(),
+    }
+
+
+_BASE_LLAMA_TP_PLAN = _get_base_llama_tp_plan()
+_FP8_LLAMA_TP_PLAN = _get_base_llama_tp_plan(
+    _colwise_parallel_cls=Float8ColwiseParallel,
+    _rowwise_parallel_cls=Float8RowwiseParallel,
+)
 
 
 def base_llama_tp_plan() -> Dict[str, ParallelStyle]:
@@ -50,4 +71,16 @@ def base_llama_tp_plan() -> Dict[str, ParallelStyle]:
     Returns:
         Dict[str, Any]: The tensor parallel plan for Llama3 model.
     """
-    return BASE_LLAMA_TP_PLAN
+    return _BASE_LLAMA_TP_PLAN
+
+
+def fp8_llama_tp_plan() -> Dict[str, ParallelStyle]:
+    """
+    Return the tensor parallel plan for the Llama3 model that uses float8 all-gather for both
+    rowwise and colwise computation. It is currently only compatible with float8 fine-tuning
+    using "tensorwise" scaling. This tensor parallel plan is shared between the 3.1, 3.2, and 3.3 models.
+
+    Returns:
+        Dict[str, Any]: The float8-enabled tensor parallel plan for the Llama3 model.
+    """
+    return _FP8_LLAMA_TP_PLAN
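For context, a short sketch of what the new helper returns and how it pairs with the validation added in torchtune/training/quantization.py (next file): the float8 plan simply substitutes the Float8 colwise/rowwise styles, and is rejected for non-tensorwise recipes. The specific keys checked below are taken from the plan above; the rest is illustrative:

# Sketch: inspect the float8 TP plan and exercise the recipe-name guard.
from torchao.float8.float8_tensor_parallel import (
    Float8ColwiseParallel,
    Float8RowwiseParallel,
)
from torchtune.models.llama3 import fp8_llama_tp_plan
from torchtune.training.quantization import validate_float8_tp_plan

plan = fp8_llama_tp_plan()
assert isinstance(plan["layers.*.attn.q_proj"], Float8ColwiseParallel)
assert isinstance(plan["layers.*.mlp.w2"], Float8RowwiseParallel)

# Accepted: tensorwise (or unset) scaling works with the float8 plan.
validate_float8_tp_plan(plan, "tensorwise")
validate_float8_tp_plan(plan, None)

# Rejected: float8 TP styles are only supported with tensorwise scaling.
try:
    validate_float8_tp_plan(plan, "rowwise")
except ValueError as err:
    print(err)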

torchtune/training/quantization.py

Lines changed: 68 additions & 3 deletions
@@ -4,18 +4,25 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from typing import Callable, Optional
+from typing import Callable, Dict, Optional
 
 from torch import nn
+from torch.distributed.tensor.parallel.style import ParallelStyle
 
 from torchao.dtypes import TensorCoreTiledLayout
-
+from torchao.float8 import (
+    convert_to_float8_training as _convert_to_float8_training_torchao,
+    Float8LinearConfig,
+)
+from torchao.float8.float8_tensor_parallel import (
+    Float8ColwiseParallel,
+    Float8RowwiseParallel,
+)
 from torchao.quantization import (
     int4_weight_only,
     int8_dynamic_activation_int4_weight,
     quantize_,
 )
-
 from torchao.quantization.qat import (
     Int4WeightOnlyQATQuantizer,
     Int8DynActInt4WeightQATQuantizer,
@@ -26,6 +33,7 @@
     enable_4w_fake_quant,
     enable_8da4w_fake_quant,
 )
+
 from torchtune.modules.peft.lora import LoRALinear, QATLoRALinear
 
 
@@ -219,3 +227,60 @@ def swap_lora_linear_with_qat(
         activation_qat_config,
         weight_qat_config,
     )
+
+
+def convert_to_float8_training(
+    model: nn.Module,
+    fp8_recipe_name: Optional[str] = None,
+) -> nn.Module:
+    """
+    Prepare the model for float8 training by swapping all `nn.Linear` with `Float8Linear`.
+
+    Args:
+        model (nn.Module): The model to swap linear layers on
+        fp8_recipe_name (Optional[str]): name to identify one of the pre-made recipes,
+            one of "tensorwise", "rowwise", and "rowwise_with_gw_hp". If not specified,
+            defaults to "tensorwise" with "enable_fsdp_float8_all_gather=True". See
+            https://github.com/pytorch/ao/blob/v0.9.0/torchao/float8/config.py#L150
+            for more details.
+
+    Returns:
+        (nn.Module) The new model with `Float8Linear`.
+    """
+    if fp8_recipe_name is not None:
+        fp8_config = Float8LinearConfig.from_recipe_name(fp8_recipe_name)
+    else:
+        fp8_config = Float8LinearConfig(enable_fsdp_float8_all_gather=True)
+    return _convert_to_float8_training_torchao(
+        model,
+        config=fp8_config,
+        module_filter_fn=lambda mod, fqn: fqn != "output",
+    )
+
+
+def validate_float8_tp_plan(
+    tp_plan: Optional[Dict[str, ParallelStyle]],
+    fp8_recipe_name: Optional[str] = None,
+) -> None:
+    """
+    Validate that the provided tensor parallel plan is compatible with the
+    float8 settings. Specifically, float8 tensor parallel plans are only
+    supported when using 'tensorwise' float8 recipes.
+    """
+    if tp_plan is None or is_fp8_tensorwise_scaling(fp8_recipe_name):
+        return
+    for parallel_style in tp_plan.values():
+        if isinstance(parallel_style, Float8ColwiseParallel) or isinstance(
+            parallel_style, Float8RowwiseParallel
+        ):
+            raise ValueError(
+                "%s and %s are only compatible with 'tensorwise' float8 recipes"
+                % (Float8ColwiseParallel.__name__, Float8RowwiseParallel.__name__)
+            )
+
+
+def is_fp8_tensorwise_scaling(fp8_recipe_name: Optional[str]) -> bool:
+    """
+    Return True if the fp8 recipe name refers to 'tensorwise' scaling.
+    """
+    return fp8_recipe_name is None or fp8_recipe_name == "tensorwise"
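A small usage sketch of the new wrapper, on a toy module rather than a torchtune model: every nn.Linear except the submodule whose fully qualified name is "output" is swapped for torchao's Float8Linear. The module swap itself should not need a GPU; running a forward pass would additionally require float8-capable hardware. ToyModel and its layer sizes are purely illustrative:

# Sketch: the module_filter_fn keeps the module named "output" in high precision.
from torch import nn
from torchao.float8.float8_linear import Float8Linear
from torchtune.training.quantization import convert_to_float8_training


class ToyModel(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.proj = nn.Linear(128, 128, bias=False)
        self.output = nn.Linear(128, 128, bias=False)  # fqn == "output" -> left as nn.Linear

    def forward(self, x):
        return self.output(self.proj(x))


model = convert_to_float8_training(ToyModel(), fp8_recipe_name=None)
assert isinstance(model.proj, Float8Linear)
assert not isinstance(model.output, Float8Linear)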
