Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
883da22
Fix for Squad Dataset Download
rhmukundan Jun 11, 2025
9c3acd5
Giving the option to pass the sequence length from the finetune script
rhmukundan Jun 11, 2025
d4acb9c
Rebase Pushing llama4 finetuning e128 script and llama3 70b finetunin…
rhmukundan Jun 12, 2025
fec6d78
Finetune Llama4 Recipe with dataset download fix
rhmukundan Jun 12, 2025
8ee14a9
Address PR comments
rhmukundan Jun 14, 2025
fdb5487
Tweaks to finetune_llama4_e128
rhmukundan Jun 15, 2025
adbb91d
Addressing PR comments
rhmukundan Jun 16, 2025
e27360c
Giving an option to have either AutoTokenizer or NullTokenizer for pr…
rhmukundan Jun 16, 2025
794e1b2
Fix kwargs
rhmukundan Jun 16, 2025
619b167
User passing vocab_size while using the NullTokenizer for downloading…
rhmukundan Jun 16, 2025
04e2364
Adding model configs for finetune llama4
rhmukundan Jun 16, 2025
830dce5
Rebase Introducing the fix to llama3 finetuning recipes as well
rhmukundan Jun 16, 2025
9976083
Setting default vocab_size to None in prepare_squad_dataset_experimen…
rhmukundan Jun 16, 2025
d133fd8
Fix merge conflicts
rhmukundan Jun 16, 2025
452399d
Fixing the search condition for the dataset
rhmukundan Jun 16, 2025
3deddf4
Apply isort and black reformatting
rhmukundan Jun 16, 2025
766f71f
Removing NullTokenizer from Finetuning scripts
rhmukundan Jun 17, 2025
a95713b
Import cleanup
rhmukundan Jun 17, 2025
0673cd2
Apply isort and black reformatting
rhmukundan Jun 17, 2025
777b5ed
Merge branch 'main' into rhmukundan/fix-squad-dataset-download
rhmukundan Jun 17, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 15 additions & 13 deletions scripts/performance/llm/finetune_llama31_405b.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,21 @@
import fiddle._src.experimental.dataclasses as fdl_dc
import nemo_run as run

from nemo.collections.llm.gpt.data.squad import SquadDataModule
from nemo.collections.llm.recipes.llama31_405b import finetune_recipe, model
from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import (
userbuffers_fp8_h100_h16384_tp4_mbs1_seqlen2048_lora,
)
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin, PerfEnvPlugin

from ..argument_parser import parse_cli_args
from ..executors import slurm_executor
from ..helpers import args_sanity_check, get_user_configs, set_exp_logging_configs, set_primary_perf_configs
from ..utils import get_comm_overlap_callback_idx, hf_tokenizer, import_ckpt_experiment, isfile_train_pack_metadata
from ..utils import (
get_comm_overlap_callback_idx,
hf_tokenizer,
import_ckpt_experiment,
prepare_squad_dataset_experiment,
)

HF_MODEL_URI = "meta-llama/Llama-3.1-405B"

Expand All @@ -39,6 +42,10 @@
# downloaded from HuggingFace
SKIP_IMPORT = False

# Set this to True if dataset is already downloaded. If set to False,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not very clear; previously without this PR , are we not downloading the dataset from huggingface? My impression is it's still done somewhere in the dataset building process, just not explicitly, right? I think the difference is here you are separating it out as a new nemo-run experiment. If comment like this, users would think setting it to True without a local file will error out, but in reality it won't?

Could you further explain what different things are happening between False and True here;

# dataset will be downloaded from HuggingFace
SKIP_DATASET_DOWNLOAD = False


def override_recipe_configs(
args: str,
Expand Down Expand Up @@ -98,16 +105,7 @@ def override_recipe_configs(
)

# data module configs
if args.use_hf_tokenizer:
recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI)
else:
recipe.data.tokenizer = run.Config(
get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=128256
)
recipe.model.tokenizer = recipe.data.tokenizer
if recipe.data.__fn_or_cls__ == SquadDataModule and not isfile_train_pack_metadata(HF_MODEL_URI, recipe.data):
# flag is valid only for SquadDataModule
recipe.data.force_redownload = True
recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI)

comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks)
assert comm_overlap_callback_idx is not None, "MegatronCommOverlapCallback missing. Required for performance."
Expand Down Expand Up @@ -216,6 +214,10 @@ def override_recipe_configs(
if not SKIP_IMPORT:
assert args.hf_token is not None, "HF token is required for importing checkpoint from HuggingFace"
exp.add(*import_ckpt_experiment(executor, model(), source=f"hf://{HF_MODEL_URI}"))
if not SKIP_DATASET_DOWNLOAD:
exp.add(
*prepare_squad_dataset_experiment(executor, HF_MODEL_URI, seq_length=4096, nemo_home=args.nemo_home)
)
exp.add(
recipe,
executor=executor,
Expand Down
29 changes: 16 additions & 13 deletions scripts/performance/llm/finetune_llama3_70b.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,21 @@
import fiddle._src.experimental.dataclasses as fdl_dc
import nemo_run as run

from nemo.collections.llm.gpt.data.squad import SquadDataModule
from nemo.collections.llm.recipes.llama3_70b import finetune_recipe, model
from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import (
userbuffers_fp8_h100_h8192_tp2_mbs1_seqlen4096_lora,
)
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin, PerfEnvPlugin

from ..argument_parser import parse_cli_args
from ..executors import slurm_executor
from ..helpers import args_sanity_check, get_user_configs, set_exp_logging_configs, set_primary_perf_configs
from ..utils import get_comm_overlap_callback_idx, hf_tokenizer, import_ckpt_experiment, isfile_train_pack_metadata
from ..utils import (
get_comm_overlap_callback_idx,
hf_tokenizer,
import_ckpt_experiment,
prepare_squad_dataset_experiment,
)

HF_MODEL_URI = "meta-llama/Meta-Llama-3-70B"

Expand All @@ -39,6 +42,10 @@
# downloaded from HuggingFace
SKIP_IMPORT = False

# Set this to True if the SQuAD dataset has already been prepared under 'NEMO_HOME'.
# If set to False, an extra Slurm job is scheduled first to download the dataset
# from HuggingFace and preprocess it before the fine-tuning job starts.
SKIP_DATASET_DOWNLOAD = False


def override_recipe_configs(
args: str,
Expand Down Expand Up @@ -105,16 +112,7 @@ def override_recipe_configs(
)

# data module configs
if args.use_hf_tokenizer:
recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI)
else:
recipe.data.tokenizer = run.Config(
get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=128256
)
recipe.model.tokenizer = recipe.data.tokenizer
if recipe.data.__fn_or_cls__ == SquadDataModule and not isfile_train_pack_metadata(HF_MODEL_URI, recipe.data):
# flag is valid only for SquadDataModule
recipe.data.force_redownload = True
recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI)

comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks)
assert comm_overlap_callback_idx is not None, "MegatronCommOverlapCallback missing. Required for performance."
Expand Down Expand Up @@ -223,6 +221,11 @@ def override_recipe_configs(
if not SKIP_IMPORT:
assert args.hf_token is not None, "HF token is required for importing checkpoint from HuggingFace"
exp.add(*import_ckpt_experiment(executor, model(), source=f"hf://{HF_MODEL_URI}"))
if not SKIP_DATASET_DOWNLOAD:
exp.add(
*prepare_squad_dataset_experiment(executor, HF_MODEL_URI, seq_length=4096, nemo_home=args.nemo_home)
)

exp.add(
recipe,
executor=executor,
Expand Down
23 changes: 10 additions & 13 deletions scripts/performance/llm/finetune_llama3_8b.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,13 @@

import nemo_run as run

from nemo.collections.llm.gpt.data.squad import SquadDataModule
from nemo.collections.llm.recipes.llama3_8b import finetune_recipe, model
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin, PerfEnvPlugin

from ..argument_parser import parse_cli_args
from ..executors import slurm_executor
from ..helpers import args_sanity_check, get_user_configs, set_exp_logging_configs, set_primary_perf_configs
from ..utils import hf_tokenizer, import_ckpt_experiment, isfile_train_pack_metadata
from ..utils import hf_tokenizer, import_ckpt_experiment, prepare_squad_dataset_experiment

HF_MODEL_URI = "meta-llama/Meta-Llama-3-8B"

Expand All @@ -34,6 +32,10 @@
# downloaded from HuggingFace
SKIP_IMPORT = False

# Set this to True if the SQuAD dataset has already been prepared under 'NEMO_HOME'.
# If set to False, an extra Slurm job is scheduled first to download the dataset
# from HuggingFace and preprocess it before the fine-tuning job starts.
SKIP_DATASET_DOWNLOAD = False


def override_recipe_configs(
args: str,
Expand Down Expand Up @@ -92,16 +94,7 @@ def override_recipe_configs(
)

# data module configs
if args.use_hf_tokenizer:
recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI)
else:
recipe.data.tokenizer = run.Config(
get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=128256
)
recipe.model.tokenizer = recipe.data.tokenizer
if recipe.data.__fn_or_cls__ == SquadDataModule and not isfile_train_pack_metadata(HF_MODEL_URI, recipe.data):
# flag is valid only for SquadDataModule
recipe.data.force_redownload = True
recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI)

recipe.optim.config.use_distributed_optimizer = True
recipe.model.config.disable_parameter_transpose_cache = True
Expand Down Expand Up @@ -157,6 +150,10 @@ def override_recipe_configs(
if not SKIP_IMPORT:
assert args.hf_token is not None, "HF token is required for importing checkpoint from HuggingFace"
exp.add(*import_ckpt_experiment(executor, model(), source=f"hf://{HF_MODEL_URI}"))
if not SKIP_DATASET_DOWNLOAD:
exp.add(
*prepare_squad_dataset_experiment(executor, HF_MODEL_URI, seq_length=4096, nemo_home=args.nemo_home)
)
exp.add(
recipe,
executor=executor,
Expand Down
209 changes: 209 additions & 0 deletions scripts/performance/llm/finetune_llama4_e128.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from os.path import basename, splitext

import nemo_run as run

from nemo.collections.llm.recipes.llama4_e128 import finetune_recipe, model
from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed
from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin, PerfEnvPlugin

from ..argument_parser import parse_cli_args
from ..utils import (
args_sanity_check,
get_user_configs,
hf_tokenizer,
import_ckpt_experiment,
prepare_squad_dataset_experiment,
set_exp_logging_configs,
set_primary_perf_configs,
slurm_executor,
)

HF_MODEL_URI = "meta-llama/Llama-4-Maverick-17B-128E-Instruct"

# Set this to True if checkpoint is available at 'NEMO_HOME'. If set to False,
# extra Slurm job will be scheduled. In this case, if checkpoint is available
# at 'NEMO_HOME', fine-tuning job will use this checkpoint, else, it will be
# downloaded from HuggingFace
SKIP_IMPORT = False

# Set this to True if the SQuAD dataset has already been prepared under 'NEMO_HOME'.
# If set to False, an extra Slurm job is scheduled first to download the dataset
# from HuggingFace and preprocess it before the fine-tuning job starts.
SKIP_DATASET_DOWNLOAD = False


def override_recipe_configs(
    args: str,
    num_nodes: int,
    mbs: int,
    gbs: int,
    tp_size: int,
    pp_size: int,
    cp_size: int,
    vp_size: int,
    ep_size: int,
    etp_size: int,
    enable_cuda_graphs: bool,
    use_mcore_fsdp: bool,
    recompute_layers: int,
    activation_offload_layers: int,
):
    """
    Build the Llama4 e128 fine-tuning recipe tuned for best possible performance.

    Starts from the stock ``finetune_recipe`` and layers on parallelism sizes,
    logging configuration, tokenizer, precision plugins, and kernel-fusion flags.

    NOTE: Use fp8 precision training with caution. It might not give desirable results.
    """
    # "sft" maps to the non-PEFT ("none") scheme; anything else (e.g. "lora") is passed through.
    if args.finetuning == "sft":
        peft_scheme = "none"
    else:
        peft_scheme = args.finetuning

    recipe = finetune_recipe(peft_scheme=peft_scheme, performance_mode=True, packed_sequence=True)

    # Parallelism / performance knobs (TP, PP, CP, VP, EP, ETP, CUDA graphs, FSDP,
    # recompute and activation offload) applied via the shared helper.
    recipe = set_primary_perf_configs(
        recipe,
        peft_scheme,
        num_nodes,
        args.gpus_per_node,
        mbs,
        gbs,
        args.max_steps,
        tp_size,
        pp_size,
        cp_size,
        vp_size,
        ep_size,
        etp_size,
        enable_cuda_graphs=enable_cuda_graphs,
        use_mcore_fsdp=use_mcore_fsdp,
        recompute_layers=recompute_layers,
        activation_offload_layers=activation_offload_layers,
        compute_dtype=args.compute_dtype,
        fp8_recipe=args.fp8_recipe,
    )

    # TensorBoard / Weights & Biases logging configuration.
    recipe = set_exp_logging_configs(
        recipe,
        peft_scheme,
        "llm",
        "llama4",
        args.tensorboard,
        args.wandb,
        args.wandb_prj_name,
        args.wandb_job_name,
    )

    # Data module: use the HuggingFace tokenizer matching the checkpoint.
    recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI)

    # Precision: swap in the bf16+fp8 mixed plugin when fp8 compute is requested.
    if args.compute_dtype.lower() == "fp8":
        recipe.trainer.plugins = bf16_with_fp8_mixed()
        recipe.trainer.plugins.grad_reduce_in_fp32 = False

    # Enable kernel fusions on the model config for throughput.
    for fusion_attr, fusion_value in (
        ("cross_entropy_fusion_impl", "te"),
        ("cross_entropy_loss_fusion", True),
        ("apply_rope_fusion", True),
        ("moe_permute_fusion", True),
    ):
        setattr(recipe.model.config, fusion_attr, fusion_value)

    return recipe


if __name__ == "__main__":
    # Parse and validate CLI arguments shared across the performance scripts.
    args = parse_cli_args().parse_args()
    args_sanity_check(args)

    # Look up the tuned default configuration for this GPU/model/size combination;
    # the first 13 entries are the parallelism and perf knobs consumed below.
    kwargs = get_user_configs(args.gpu.lower(), "sft", "llama4", "e128", args)
    (
        num_nodes,
        mbs,
        gbs,
        tp_size,
        pp_size,
        cp_size,
        vp_size,
        ep_size,
        etp_size,
        enable_cuda_graphs,
        use_mcore_fsdp,
        recompute_layers,
        activation_offload_layers,
    ) = kwargs[0:13]

    # Build the fully-configured fine-tuning recipe.
    recipe = override_recipe_configs(
        args,
        num_nodes,
        mbs,
        gbs,
        tp_size,
        pp_size,
        cp_size,
        vp_size,
        ep_size,
        etp_size,
        enable_cuda_graphs,
        use_mcore_fsdp,
        recompute_layers,
        activation_offload_layers,
    )
    # Experiment name encodes the script name, compute dtype, and parallelism layout
    # so result directories are self-describing.
    exp_config = (
        f"{num_nodes}nodes_tp{tp_size}_pp{pp_size}_cp{cp_size}_vp{vp_size}_ep{ep_size}_etp{etp_size}_{mbs}mbs_{gbs}gbs"
    )
    exp_name = f"{splitext(basename(__file__))[0]}_{args.compute_dtype}_{exp_config}"

    # Baseline environment plugin; PP comm chunk size only matters with pipeline parallelism.
    plugins = [
        PerfEnvPlugin(
            enable_vboost=True,
            nccl_pp_comm_chunksize=2097152 if pp_size > 1 else None,
            gpu_sm100_or_newer=(args.gpu.lower() in ['b200', 'gb200']),
        )
    ]

    # Optional profiling plugins, enabled via CLI flags.
    if args.enable_nsys:
        plugins.append(NsysPlugin(start_step=5, end_step=6))
    if args.enable_memory_profile:
        assert args.memory_profile_out_path is not None
        plugins.append(MemoryProfilePlugin(dir=args.memory_profile_out_path))

    # Slurm executor shared by all jobs added to the experiment below.
    executor = slurm_executor(
        args.account,
        args.partition,
        args.log_dir,
        num_nodes,
        args.gpus_per_node,
        args.time_limit,
        args.container_image,
        custom_mounts=args.custom_mounts,
        custom_env_vars={},
        hf_token=args.hf_token,
        nemo_home=args.nemo_home,
        wandb_key=args.wandb_key,
    )

    # Jobs are added in dependency order and run sequentially (see exp.run below):
    # optional checkpoint import, optional dataset preparation, then fine-tuning.
    with run.Experiment(exp_name) as exp:
        if not SKIP_IMPORT:
            assert args.hf_token is not None, "HF token is required for importing checkpoint from HuggingFace"
            exp.add(*import_ckpt_experiment(executor, model(), source=f"hf://{HF_MODEL_URI}"))
        if not SKIP_DATASET_DOWNLOAD:
            # NOTE(review): seq_length=4096 is hard-coded here; presumably it must match
            # the recipe's packed-sequence length — confirm against finetune_recipe.
            exp.add(
                *prepare_squad_dataset_experiment(executor, HF_MODEL_URI, seq_length=4096, nemo_home=args.nemo_home)
            )
        exp.add(
            recipe,
            executor=executor,
            name=exp_name,
            plugins=plugins,
        )
        if not args.dryrun:
            exp.run(sequential=True, detach=True)
        else:
            exp.dryrun()
Loading
Loading