2121import torch .nn .functional as F
2222from torch import nn
2323
24- from nemo .collections .llm .gpt .model .base import GPTConfig , GPTModel
24+ from nemo .collections .llm .gpt .model .base import GPTConfig , GPTModel , torch_dtype_from_mcore_config
2525from nemo .collections .llm .utils import Config
2626from nemo .lightning import OptimizerModule , io , teardown
2727from nemo .lightning .pytorch .utils import dtype_from_hf
@@ -295,16 +295,16 @@ def make_vocab_size_divisible_by(vocab_size):
295295
296296@io .model_exporter (LlamaModel , "hf" )
297297class HFLlamaExporter (io .ModelConnector [LlamaModel , "LlamaForCausalLM" ]):
298- def init (self ) -> "LlamaForCausalLM" :
298+ def init (self , dtype = torch . bfloat16 ) -> "LlamaForCausalLM" :
299299 from transformers import AutoModelForCausalLM
300300 from transformers .modeling_utils import no_init_weights
301301
302302 with no_init_weights (True ):
303- return AutoModelForCausalLM .from_config (self .config )
303+ return AutoModelForCausalLM .from_config (self .config , torch_dtype = dtype )
304304
305305 def apply (self , output_path : Path ) -> Path :
306- target = self .init ()
307306 source , _ = self .nemo_load (str (self ))
307+ target = self .init (torch_dtype_from_mcore_config (source .config ))
308308 target = self .convert_state (source , target )
309309
310310 target = target .cpu ()
@@ -321,10 +321,9 @@ def convert_state(self, source, target):
321321 "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight" : "model.layers.*.input_layernorm.weight" ,
322322 "decoder.layers.*.mlp.linear_fc1.layer_norm_weight" : "model.layers.*.post_attention_layernorm.weight" ,
323323 "decoder.final_layernorm.weight" : "model.norm.weight" ,
324- "output_layer.weight" : "lm_head.weight" ,
325324 }
326325
327- return io .apply_transforms (source , target , mapping = mapping , transforms = [_export_qkv , _export_linear_fc1 ])
326+ return io .apply_transforms (source , target , mapping = mapping , transforms = [_export_qkv , _export_linear_fc1 , _export_embedding , _export_head ])
328327
329328 @property
330329 def tokenizer (self ):
@@ -426,6 +425,26 @@ def _export_qkv(ctx: io.TransformCTX, linear_qkv):
426425 return q_proj , k_proj , v_proj
427426
428427
@io.state_transform(
    source_key="embedding.word_embeddings.weight",
    target_key="model.embed_tokens.weight",
)
def _export_embedding(ctx: io.TransformCTX, embedding):
    """Export the word-embedding table to HF, pruning vocab-padding rows.

    NeMo/Megatron pads the embedding table (see ``make_vocab_size_divisible_by``)
    so the row count can exceed the true vocabulary size; HF expects exactly
    ``vocab_size`` rows, so the extra padding rows are sliced off.
    """
    # NOTE(review): during export ``ctx.target`` is the HF model, so this is the
    # HF config's vocab_size — the original local name ``megatron_config`` was
    # misleading.
    hf_config = ctx.target.config
    return embedding[: hf_config.vocab_size, :]
436+
437+
@io.state_transform(
    source_key="output_layer.weight",
    target_key="lm_head.weight",
)
def _export_head(ctx: io.TransformCTX, embedding):
    """Export the output-projection (LM head) weight to HF, pruning padding rows.

    Like the embedding table, the NeMo/Megatron output layer may carry extra
    rows added for vocab-size padding; HF's ``lm_head`` expects exactly
    ``vocab_size`` rows.
    """
    # NOTE(review): during export ``ctx.target`` is the HF model, so this is the
    # HF config's vocab_size — the original local name ``megatron_config`` was
    # misleading.
    hf_config = ctx.target.config
    return embedding[: hf_config.vocab_size, :]
446+
447+
429448@io .state_transform (
430449 source_key = ("model.layers.*.mlp.gate_proj.weight" , "model.layers.*.mlp.up_proj.weight" ),
431450 target_key = "decoder.layers.*.mlp.linear_fc1.weight" ,
@@ -443,6 +462,15 @@ def _export_linear_fc1(linear_fc1):
443462
444463 return gate_proj , up_proj
445464
446474
447475def apply_rope_scaling (
448476 inv_freq ,