Skip to content

Commit 5986ed2

Browse files
authored
[distributed] add stage metrics - total params per stage, total size and present it in a nicely formatted manner (#1120)
* add stage metrics - total params per stage, total size * PR feedback * PR feedback, typing
1 parent 8b6aa07 commit 5986ed2

File tree

2 files changed

+59
-4
lines changed

2 files changed

+59
-4
lines changed

dist_run.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,15 @@
2525
load_safetensor_weights,
2626
)
2727

28-
from distributed.utils import Color as color, TrackTime, CUDATrackTime, GPUMemoryMonitor
28+
from distributed.utils import (
29+
Color as color,
30+
GPUMemoryMonitor,
31+
get_module_size,
32+
get_num_params,
33+
bytes_to_readable,
34+
TrackTime,
35+
CUDATrackTime,
36+
)
2937

3038
from distributed.verification_utils import find_cpu_tensors
3139
from torchchat.cli.builder import TokenizerArgs, _initialize_tokenizer
@@ -193,11 +201,17 @@ def main():
193201
logger.info(f"Loading weights for {pp_rank=} on {device=}")
194202
with TrackTime("cuda") as timer:
195203
_load_model_weights(model, hf_model_name, device=device, model_config=config)
196-
197204
logger.info(
198205
f"{color.green}Total weight loading time: {timer.get_time()} {timer.unit} for stage {rank}{color.reset}"
199206
)
200-
207+
208+
# info on stage size and params
209+
stage_size = get_module_size(model)
210+
stage_size_formatted = bytes_to_readable(stage_size)
211+
stage_num_params = get_num_params(model)
212+
logger.info(
213+
f"Stage {rank} has {color.blue}{stage_num_params} params{color.reset}, Size: {color.blue}{stage_size_formatted}{color.reset}\n"
214+
)
201215

202216
# Setup input position
203217
# input_pos for prefill: a list of increasing integers from 0 to seqlen

distributed/utils.py

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
# This source code is licensed under the BSD-style license found in the
55
# LICENSE file in the root directory of this source tree.
66

7+
import itertools
78
import os
89
from dataclasses import dataclass
910
from datetime import timedelta
@@ -54,7 +55,47 @@ def get_num_params(model: torch.nn.Module, exclude_embedding: bool = False) -> i
5455
num_params = sum(p.numel() for p in model.parameters())
5556
if exclude_embedding:
5657
num_params -= model.tok_embeddings.weight.numel()
57-
return num_params
58+
readable_num_params = format_model_params(num_params)
59+
return readable_num_params
60+
61+
62+
def get_module_size(stage: torch.nn.Module) -> int:
63+
"""get module (stage) size in bytes"""
64+
model_size = sum(
65+
[
66+
p.numel() * p.dtype.itemsize
67+
for p in itertools.chain(stage.parameters(), stage.buffers())
68+
]
69+
)
70+
return model_size
71+
72+
73+
def format_model_params(params):
    """turn the num_params into a readable formatted number"""
    # Walk the scale thresholds from largest to smallest; the first one
    # the count reaches determines the suffix (B = billions, M = millions).
    for threshold, suffix in ((1_000_000_000, "B"), (1_000_000, "M")):
        if params >= threshold:
            return f"{params / threshold:.2f}{suffix}"
    # Small counts: plain thousands-separated integer.
    return f"{params:,}"
81+
82+
83+
def bytes_to_readable(bytes_value: int, round_to: int = 2) -> str:
    """formatting function to make reading model (stage) sizes easy"""
    GiB = 1024**3  # 1 GiB in bytes
    MiB = 1024**2  # 1 MiB in bytes

    # Anything below one GiB is reported in MiB; no smaller units are
    # needed for model-stage sizes.
    unit, divisor = ("GiB", GiB) if bytes_value >= GiB else ("MiB", MiB)

    # round() drops trailing zeros, so e.g. 1.5 renders as "1.5 GiB";
    # precision is controlled by round_to (default 2 decimal places).
    return f"{round(bytes_value / divisor, round_to)} {unit}"
5899

59100

60101
@dataclass(frozen=True)

0 commit comments

Comments
 (0)