@@ -8,6 +8,15 @@
 import torch
 import torch.nn.functional as F
 from torch import nn
+from torch.distributed._tensor import Replicate, Shard
+from torch.distributed.device_mesh import DeviceMesh
+from torch.distributed.tensor.parallel import (
+    ColwiseParallel,
+    PrepareModuleInput,
+    RowwiseParallel,
+    SequenceParallel,
+    parallelize_module,
+)


 @dataclass
@@ -27,6 +36,7 @@ class ModelArgs:
     # If `True`, then each transformer block init uses its layer ID, and if
     # `False`, each uses the total number of transformer blocks
     depth_init: bool = True
+    device: str = "cuda"


 def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0) -> torch.Tensor:
@@ -168,14 +178,22 @@ class Attention(nn.Module):
     def __init__(self, model_args: ModelArgs):
         super().__init__()
         self.n_heads = model_args.n_heads
-        self.n_kv_heads = model_args.n_heads if model_args.n_kv_heads is None else model_args.n_kv_heads
+        self.n_kv_heads = (
+            model_args.n_heads
+            if model_args.n_kv_heads is None
+            else model_args.n_kv_heads
+        )
         self.n_rep = self.n_heads // self.n_kv_heads
         self.head_dim = model_args.dim // model_args.n_heads

-        self.wq = nn.Linear(model_args.dim, model_args.n_heads * self.head_dim, bias=False)
+        self.wq = nn.Linear(
+            model_args.dim, model_args.n_heads * self.head_dim, bias=False
+        )
         self.wk = nn.Linear(model_args.dim, self.n_kv_heads * self.head_dim, bias=False)
         self.wv = nn.Linear(model_args.dim, self.n_kv_heads * self.head_dim, bias=False)
-        self.wo = nn.Linear(model_args.n_heads * self.head_dim, model_args.dim, bias=False)
+        self.wo = nn.Linear(
+            model_args.n_heads * self.head_dim, model_args.dim, bias=False
+        )

     def init_weights(self, init_std: float):
         for linear in (self.wq, self.wk, self.wv):
@@ -216,7 +234,9 @@ def forward(

         # we use casual mask for training
         output = F.scaled_dot_product_attention(xq, xk, xv, is_causal=True)
-        output = output.transpose(1, 2).contiguous()  # (bs, seqlen, n_local_heads, head_dim)
+        output = output.transpose(
+            1, 2
+        ).contiguous()  # (bs, seqlen, n_local_heads, head_dim)
         output = output.view(bs, seqlen, -1)
         return self.wo(output)

@@ -330,7 +350,7 @@ def init_weights(self):
         self.feed_forward.init_weights(self.weight_init_std)


-class Transformer(nn.Module):
+class ParallelTransformer(nn.Module):
     """Transformer Module.

     Args:
@@ -348,13 +368,16 @@ class Transformer(nn.Module):

     """

-    def __init__(self, model_args: ModelArgs):
+    def __init__(self, model_args: ModelArgs, tp_mesh: DeviceMesh):
+        # Initialize and parallelize the model layer by layer to avoid
+        # running out of memory on a single device.
         super().__init__()
         self.model_args = model_args
         self.vocab_size = model_args.vocab_size
         self.n_layers = model_args.n_layers

         self.tok_embeddings = nn.Embedding(model_args.vocab_size, model_args.dim)
+        self.tok_embeddings.to(model_args.device)
+        self.tok_embeddings = self.parallel_embeddings(self.tok_embeddings, tp_mesh)

         # TODO persistent should be set to false, since this buffer can be recomputed.
         # however, we set it to true for 2 reasons. (1) due to pytorch/pytorch#123411,
@@ -363,17 +386,83 @@ def __init__(self, model_args: ModelArgs):
         # a seed checkpoint rather than calling init_weights, we need freqs_cis to be
         # initialized by the checkpoint, or we need to add a separate initializer for
         # just the non-persistent buffers that is called after loading checkpoints.
-        self.register_buffer("freqs_cis", self._precompute_freqs_cis(), persistent=True)
+        self.register_buffer(
+            "freqs_cis",
+            self._precompute_freqs_cis().to(model_args.device),
+            persistent=True,
+        )

-        self.layers = torch.nn.ModuleDict()
+        self.layers = torch.nn.ModuleDict().to(model_args.device)
         for layer_id in range(model_args.n_layers):
-            self.layers[str(layer_id)] = TransformerBlock(layer_id, model_args)
+            block = TransformerBlock(layer_id, model_args).to(model_args.device)
+            self.layers[str(layer_id)] = block
+            self.parallel_transformer_block(self.layers[str(layer_id)], tp_mesh)

-        self.norm = RMSNorm(dim=model_args.dim, eps=model_args.norm_eps)
-
-        self.output = nn.Linear(model_args.dim, model_args.vocab_size, bias=False)
+        self.norm = RMSNorm(dim=model_args.dim, eps=model_args.norm_eps).to(
+            model_args.device
+        )
+        self.norm = self.parallel_norm(self.norm, tp_mesh)
+        self.output = nn.Linear(model_args.dim, model_args.vocab_size, bias=False).to(
+            model_args.device
+        )
+        self.output = self.parallel_output(self.output, tp_mesh)
         self.init_weights()

+    def parallel_transformer_block(self, transformer_block, tp_mesh):
+        if tp_mesh.size() <= 1:
+            return
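+        # Tensor parallel plan for one block: wq/wk/wv and w1/w3 are sharded
+        # column-wise (each rank keeps a slice of the heads / hidden dim),
+        # while wo and w2 are sharded row-wise and emit sequence-sharded
+        # (Shard(1)) activations so the norms can run as sequence parallel.
+        # PrepareModuleInput gathers the sequence-sharded input back to a
+        # replicated layout before attention and the feed-forward block.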
+        plan = {
+            "attention": PrepareModuleInput(
+                input_layouts=(Shard(1), None),
+                desired_input_layouts=(Replicate(), None),
+            ),
+            "attention.wq": ColwiseParallel(),
+            "attention.wk": ColwiseParallel(),
+            "attention.wv": ColwiseParallel(),
+            "attention.wo": RowwiseParallel(output_layouts=Shard(1)),
+            "attention_norm": SequenceParallel(),
+            "feed_forward": PrepareModuleInput(
+                input_layouts=(Shard(1),),
+                desired_input_layouts=(Replicate(),),
+            ),
+            "feed_forward.w1": ColwiseParallel(),
+            "feed_forward.w2": RowwiseParallel(output_layouts=Shard(1)),
+            "feed_forward.w3": ColwiseParallel(),
+            "ffn_norm": SequenceParallel(),
+        }
+
+        # Adjust attention module to use the local number of heads
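+        # (each rank owns n_heads // tp_mesh.size() query heads and the matching
+        # share of key/value heads once wq/wk/wv are column-sharded; head_dim
+        # and the head ratio used for n_rep are unchanged)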
+        attn_layer = transformer_block.attention
+        attn_layer.n_heads = attn_layer.n_heads // tp_mesh.size()
+        attn_layer.n_kv_heads = attn_layer.n_kv_heads // tp_mesh.size()
+
+        # Apply the plan for the current transformer block
+        parallelize_module(transformer_block, tp_mesh, plan)
+
+    def parallel_embeddings(self, embedding, tp_mesh):
+        # The nn.Embedding module itself is passed in, so apply the parallel
+        # style to it directly; a plan keyed by "tok_embeddings" would not
+        # match any submodule of the embedding.
+        plan = RowwiseParallel(
+            input_layouts=Replicate(),
+            output_layouts=Shard(1),
+        )
+        return parallelize_module(embedding, tp_mesh, plan)
+
+    def parallel_output(self, output, tp_mesh):
+        # As above, parallelize the output nn.Linear directly rather than via
+        # a plan keyed by "output", which has no matching submodule here.
+        plan = ColwiseParallel(
+            input_layouts=Shard(1),
+        )
+        return parallelize_module(output, tp_mesh, plan)
+
+    def parallel_norm(self, norm, tp_mesh):
+        # Likewise, apply SequenceParallel to the final RMSNorm directly.
+        plan = SequenceParallel()
+        return parallelize_module(norm, tp_mesh, plan)
+
     def reset_parameters(self):
         with torch.device(self.freqs_cis.device):
             self.freqs_cis = self._precompute_freqs_cis()
@@ -447,4 +536,4 @@ def from_model_args(cls, model_args: ModelArgs) -> "Transformer":
             Transformer: Transformer model.

         """
-        return cls(model_args)
+        return cls(model_args)
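
For context, a minimal sketch of how the new `ParallelTransformer` constructor could be driven. This is not part of the commit: the `torchrun` launch, the 1-D mesh over all ranks, and the small model sizes below are illustrative assumptions.

```python
# Illustrative usage sketch (not part of this commit). Assumes a launch like
# `torchrun --nproc_per_node=8 run_tp.py`, with every rank in one TP group,
# and that ModelArgs / ParallelTransformer are importable from the file above.
import os

import torch
from torch.distributed.device_mesh import init_device_mesh

local_rank = int(os.environ["LOCAL_RANK"])  # set by torchrun
world_size = int(os.environ["WORLD_SIZE"])  # set by torchrun
torch.cuda.set_device(local_rank)

# 1-D device mesh over all ranks: this is the tp_mesh the constructor expects.
tp_mesh = init_device_mesh("cuda", (world_size,))

# Small illustrative sizes; n_heads must be divisible by the TP degree.
model_args = ModelArgs(
    dim=256, n_layers=2, n_heads=16, vocab_size=32000, device="cuda"
)
model = ParallelTransformer(model_args, tp_mesh)

# Each rank now holds only its shard of the parallelized weights; token ids
# are fed replicated, as in a single-device run, and the sequence length must
# be divisible by the TP degree for the Shard(1) activations.
tokens = torch.randint(0, model_args.vocab_size, (2, 128), device="cuda")
logits = model(tokens)  # per-rank shard of the vocab dim under ColwiseParallel
```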