Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
883da22
Fix for Squad Dataset Download
rhmukundan Jun 11, 2025
9c3acd5
Giving the option to pass the sequence length from the finetune script
rhmukundan Jun 11, 2025
d4acb9c
Rebase Pushing llama4 finetuning e128 script and llama3 70b finetunin…
rhmukundan Jun 12, 2025
fec6d78
Finetune Llama4 Recipe with dataset download fix
rhmukundan Jun 12, 2025
8ee14a9
Address PR comments
rhmukundan Jun 14, 2025
fdb5487
Tweaks to finetune_llama4_e128
rhmukundan Jun 15, 2025
adbb91d
Addressing PR comments
rhmukundan Jun 16, 2025
e27360c
Giving an option to have either AutoTokenizer or NullTokenizer for pr…
rhmukundan Jun 16, 2025
794e1b2
Fix kwargs
rhmukundan Jun 16, 2025
619b167
User passing vocab_size while using the NullTokenizer for downloading…
rhmukundan Jun 16, 2025
04e2364
Adding model configs for finetune llama4
rhmukundan Jun 16, 2025
830dce5
Rebase Introducing the fix to llama3 finetuning recipes as well
rhmukundan Jun 16, 2025
9976083
Setting default vocab_size to None in prepare_squad_dataset_experimen…
rhmukundan Jun 16, 2025
d133fd8
Fix merge conflicts
rhmukundan Jun 16, 2025
452399d
Fixing the search condition for the dataset
rhmukundan Jun 16, 2025
3deddf4
Apply isort and black reformatting
rhmukundan Jun 16, 2025
766f71f
Removing NullTokenizer from Finetuning scripts
rhmukundan Jun 17, 2025
a95713b
Import cleanup
rhmukundan Jun 17, 2025
0673cd2
Apply isort and black reformatting
rhmukundan Jun 17, 2025
777b5ed
Merge branch 'main' into rhmukundan/fix-squad-dataset-download
rhmukundan Jun 17, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 15 additions & 13 deletions scripts/performance/llm/finetune_llama31_405b.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,21 @@
import fiddle._src.experimental.dataclasses as fdl_dc
import nemo_run as run

from nemo.collections.llm.gpt.data.squad import SquadDataModule
from nemo.collections.llm.recipes.llama31_405b import finetune_recipe, model
from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import (
userbuffers_fp8_h100_h16384_tp4_mbs1_seqlen2048_lora,
)
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin, PerfEnvPlugin

from ..argument_parser import parse_cli_args
from ..executors import slurm_executor
from ..helpers import args_sanity_check, get_user_configs, set_exp_logging_configs, set_primary_perf_configs
from ..utils import get_comm_overlap_callback_idx, hf_tokenizer, import_ckpt_experiment, isfile_train_pack_metadata
from ..utils import (
get_comm_overlap_callback_idx,
hf_tokenizer,
import_ckpt_experiment,
prepare_squad_dataset_experiment,
)

HF_MODEL_URI = "meta-llama/Llama-3.1-405B"

Expand All @@ -39,6 +42,10 @@
# downloaded from HuggingFace
SKIP_IMPORT = False

# Set this to True if dataset is already downloaded. If set to False,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not very clear; previously without this PR , are we not downloading the dataset from huggingface? My impression is it's still done somewhere in the dataset building process, just not explicitly, right? I think the difference is here you are separating it out as a new nemo-run experiment. If comment like this, users would think setting it to True without a local file will error out, but in reality it won't?

Could you further explain what different things are happening between False and True here;

# dataset will be downloaded from HuggingFace
SKIP_DATASET_DOWNLOAD = False


def override_recipe_configs(
args: str,
Expand Down Expand Up @@ -98,16 +105,7 @@ def override_recipe_configs(
)

# data module configs
if args.use_hf_tokenizer:
recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI)
else:
recipe.data.tokenizer = run.Config(
get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=128256
)
recipe.model.tokenizer = recipe.data.tokenizer
if recipe.data.__fn_or_cls__ == SquadDataModule and not isfile_train_pack_metadata(HF_MODEL_URI, recipe.data):
# flag is valid only for SquadDataModule
recipe.data.force_redownload = True
recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI)

comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks)
assert comm_overlap_callback_idx is not None, "MegatronCommOverlapCallback missing. Required for performance."
Expand Down Expand Up @@ -216,6 +214,10 @@ def override_recipe_configs(
if not SKIP_IMPORT:
assert args.hf_token is not None, "HF token is required for importing checkpoint from HuggingFace"
exp.add(*import_ckpt_experiment(executor, model(), source=f"hf://{HF_MODEL_URI}"))
if not SKIP_DATASET_DOWNLOAD:
exp.add(
*prepare_squad_dataset_experiment(executor, HF_MODEL_URI, seq_length=4096, nemo_home=args.nemo_home)
)
exp.add(
recipe,
executor=executor,
Expand Down
29 changes: 16 additions & 13 deletions scripts/performance/llm/finetune_llama3_70b.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,21 @@
import fiddle._src.experimental.dataclasses as fdl_dc
import nemo_run as run

from nemo.collections.llm.gpt.data.squad import SquadDataModule
from nemo.collections.llm.recipes.llama3_70b import finetune_recipe, model
from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import (
userbuffers_fp8_h100_h8192_tp2_mbs1_seqlen4096_lora,
)
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin, PerfEnvPlugin

from ..argument_parser import parse_cli_args
from ..executors import slurm_executor
from ..helpers import args_sanity_check, get_user_configs, set_exp_logging_configs, set_primary_perf_configs
from ..utils import get_comm_overlap_callback_idx, hf_tokenizer, import_ckpt_experiment, isfile_train_pack_metadata
from ..utils import (
get_comm_overlap_callback_idx,
hf_tokenizer,
import_ckpt_experiment,
prepare_squad_dataset_experiment,
)

HF_MODEL_URI = "meta-llama/Meta-Llama-3-70B"

Expand All @@ -39,6 +42,10 @@
# downloaded from HuggingFace
SKIP_IMPORT = False

# Set this to True if the SQuAD dataset has already been prepared under 'NEMO_HOME'.
# If set to False, an extra Slurm job is scheduled first to download the dataset
# from HuggingFace and preprocess it before the fine-tuning job starts.
SKIP_DATASET_DOWNLOAD = False


def override_recipe_configs(
args: str,
Expand Down Expand Up @@ -105,16 +112,7 @@ def override_recipe_configs(
)

# data module configs
if args.use_hf_tokenizer:
recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI)
else:
recipe.data.tokenizer = run.Config(
get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=128256
)
recipe.model.tokenizer = recipe.data.tokenizer
if recipe.data.__fn_or_cls__ == SquadDataModule and not isfile_train_pack_metadata(HF_MODEL_URI, recipe.data):
# flag is valid only for SquadDataModule
recipe.data.force_redownload = True
recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI)

comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks)
assert comm_overlap_callback_idx is not None, "MegatronCommOverlapCallback missing. Required for performance."
Expand Down Expand Up @@ -223,6 +221,11 @@ def override_recipe_configs(
if not SKIP_IMPORT:
assert args.hf_token is not None, "HF token is required for importing checkpoint from HuggingFace"
exp.add(*import_ckpt_experiment(executor, model(), source=f"hf://{HF_MODEL_URI}"))
if not SKIP_DATASET_DOWNLOAD:
exp.add(
*prepare_squad_dataset_experiment(executor, HF_MODEL_URI, seq_length=4096, nemo_home=args.nemo_home)
)

exp.add(
recipe,
executor=executor,
Expand Down
23 changes: 10 additions & 13 deletions scripts/performance/llm/finetune_llama3_8b.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,13 @@

import nemo_run as run

from nemo.collections.llm.gpt.data.squad import SquadDataModule
from nemo.collections.llm.recipes.llama3_8b import finetune_recipe, model
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin, PerfEnvPlugin

from ..argument_parser import parse_cli_args
from ..executors import slurm_executor
from ..helpers import args_sanity_check, get_user_configs, set_exp_logging_configs, set_primary_perf_configs
from ..utils import hf_tokenizer, import_ckpt_experiment, isfile_train_pack_metadata
from ..utils import hf_tokenizer, import_ckpt_experiment, prepare_squad_dataset_experiment

HF_MODEL_URI = "meta-llama/Meta-Llama-3-8B"

Expand All @@ -34,6 +32,10 @@
# downloaded from HuggingFace
SKIP_IMPORT = False

# Set this to True if the SQuAD dataset has already been prepared under 'NEMO_HOME'.
# If set to False, an extra Slurm job is scheduled first to download the dataset
# from HuggingFace and preprocess it before the fine-tuning job starts.
SKIP_DATASET_DOWNLOAD = False


def override_recipe_configs(
args: str,
Expand Down Expand Up @@ -92,16 +94,7 @@ def override_recipe_configs(
)

# data module configs
if args.use_hf_tokenizer:
recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI)
else:
recipe.data.tokenizer = run.Config(
get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=128256
)
recipe.model.tokenizer = recipe.data.tokenizer
if recipe.data.__fn_or_cls__ == SquadDataModule and not isfile_train_pack_metadata(HF_MODEL_URI, recipe.data):
# flag is valid only for SquadDataModule
recipe.data.force_redownload = True
recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI)

recipe.optim.config.use_distributed_optimizer = True
recipe.model.config.disable_parameter_transpose_cache = True
Expand Down Expand Up @@ -157,6 +150,10 @@ def override_recipe_configs(
if not SKIP_IMPORT:
assert args.hf_token is not None, "HF token is required for importing checkpoint from HuggingFace"
exp.add(*import_ckpt_experiment(executor, model(), source=f"hf://{HF_MODEL_URI}"))
if not SKIP_DATASET_DOWNLOAD:
exp.add(
*prepare_squad_dataset_experiment(executor, HF_MODEL_URI, seq_length=4096, nemo_home=args.nemo_home)
)
exp.add(
recipe,
executor=executor,
Expand Down
209 changes: 209 additions & 0 deletions scripts/performance/llm/finetune_llama4_e128.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from os.path import basename, splitext

import nemo_run as run

from nemo.collections.llm.recipes.llama4_e128 import finetune_recipe, model
from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed
from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin, PerfEnvPlugin

from ..argument_parser import parse_cli_args
from ..utils import (
args_sanity_check,
get_user_configs,
hf_tokenizer,
import_ckpt_experiment,
prepare_squad_dataset_experiment,
set_exp_logging_configs,
set_primary_perf_configs,
slurm_executor,
)

HF_MODEL_URI = "meta-llama/Llama-4-Maverick-17B-128E-Instruct"

# Set this to True if checkpoint is available at 'NEMO_HOME'. If set to False,
# extra Slurm job will be scheduled. In this case, if checkpoint is available
# at 'NEMO_HOME', fine-tuning job will use this checkpoint, else, it will be
# downloaded from HuggingFace
SKIP_IMPORT = False

# Set this to True if the SQuAD dataset has already been prepared under 'NEMO_HOME'.
# If set to False, an extra Slurm job is scheduled first to download the dataset
# from HuggingFace and preprocess it before the fine-tuning job starts.
SKIP_DATASET_DOWNLOAD = False


def override_recipe_configs(
    args: str,
    num_nodes: int,
    mbs: int,
    gbs: int,
    tp_size: int,
    pp_size: int,
    cp_size: int,
    vp_size: int,
    ep_size: int,
    etp_size: int,
    enable_cuda_graphs: bool,
    use_mcore_fsdp: bool,
    recompute_layers: int,
    activation_offload_layers: int,
):
    """
    Build the Llama4 e128 fine-tuning recipe tuned for best possible performance.

    Starts from the stock ``finetune_recipe`` and layers on parallelism sizes,
    logging configuration, tokenizer, precision plugins, and kernel-fusion flags.

    NOTE: Use fp8 precision training with caution. It might not give desirable results.
    """
    # "sft" maps to the non-PEFT ("none") scheme; anything else (e.g. "lora") is passed through.
    if args.finetuning == "sft":
        peft_scheme = "none"
    else:
        peft_scheme = args.finetuning

    recipe = finetune_recipe(peft_scheme=peft_scheme, performance_mode=True, packed_sequence=True)

    # Parallelism / performance knobs (TP, PP, CP, VP, EP, ETP, CUDA graphs, FSDP,
    # recompute and activation offload) applied via the shared helper.
    recipe = set_primary_perf_configs(
        recipe,
        peft_scheme,
        num_nodes,
        args.gpus_per_node,
        mbs,
        gbs,
        args.max_steps,
        tp_size,
        pp_size,
        cp_size,
        vp_size,
        ep_size,
        etp_size,
        enable_cuda_graphs=enable_cuda_graphs,
        use_mcore_fsdp=use_mcore_fsdp,
        recompute_layers=recompute_layers,
        activation_offload_layers=activation_offload_layers,
        compute_dtype=args.compute_dtype,
        fp8_recipe=args.fp8_recipe,
    )

    # TensorBoard / Weights & Biases logging configuration.
    recipe = set_exp_logging_configs(
        recipe,
        peft_scheme,
        "llm",
        "llama4",
        args.tensorboard,
        args.wandb,
        args.wandb_prj_name,
        args.wandb_job_name,
    )

    # Data module: use the HuggingFace tokenizer matching the checkpoint.
    recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI)

    # Precision: swap in the bf16+fp8 mixed plugin when fp8 compute is requested.
    if args.compute_dtype.lower() == "fp8":
        recipe.trainer.plugins = bf16_with_fp8_mixed()
        recipe.trainer.plugins.grad_reduce_in_fp32 = False

    # Enable kernel fusions on the model config for throughput.
    for fusion_attr, fusion_value in (
        ("cross_entropy_fusion_impl", "te"),
        ("cross_entropy_loss_fusion", True),
        ("apply_rope_fusion", True),
        ("moe_permute_fusion", True),
    ):
        setattr(recipe.model.config, fusion_attr, fusion_value)

    return recipe


if __name__ == "__main__":
    # Parse and validate CLI arguments shared across the performance scripts.
    args = parse_cli_args().parse_args()
    args_sanity_check(args)

    # Look up the tuned default configuration for this GPU/model/size combination;
    # the first 13 entries are the parallelism and perf knobs consumed below.
    kwargs = get_user_configs(args.gpu.lower(), "sft", "llama4", "e128", args)
    (
        num_nodes,
        mbs,
        gbs,
        tp_size,
        pp_size,
        cp_size,
        vp_size,
        ep_size,
        etp_size,
        enable_cuda_graphs,
        use_mcore_fsdp,
        recompute_layers,
        activation_offload_layers,
    ) = kwargs[0:13]

    # Build the fully-configured fine-tuning recipe.
    recipe = override_recipe_configs(
        args,
        num_nodes,
        mbs,
        gbs,
        tp_size,
        pp_size,
        cp_size,
        vp_size,
        ep_size,
        etp_size,
        enable_cuda_graphs,
        use_mcore_fsdp,
        recompute_layers,
        activation_offload_layers,
    )
    # Experiment name encodes the script name, compute dtype, and parallelism layout
    # so result directories are self-describing.
    exp_config = (
        f"{num_nodes}nodes_tp{tp_size}_pp{pp_size}_cp{cp_size}_vp{vp_size}_ep{ep_size}_etp{etp_size}_{mbs}mbs_{gbs}gbs"
    )
    exp_name = f"{splitext(basename(__file__))[0]}_{args.compute_dtype}_{exp_config}"

    # Baseline environment plugin; PP comm chunk size only matters with pipeline parallelism.
    plugins = [
        PerfEnvPlugin(
            enable_vboost=True,
            nccl_pp_comm_chunksize=2097152 if pp_size > 1 else None,
            gpu_sm100_or_newer=(args.gpu.lower() in ['b200', 'gb200']),
        )
    ]

    # Optional profiling plugins, enabled via CLI flags.
    if args.enable_nsys:
        plugins.append(NsysPlugin(start_step=5, end_step=6))
    if args.enable_memory_profile:
        assert args.memory_profile_out_path is not None
        plugins.append(MemoryProfilePlugin(dir=args.memory_profile_out_path))

    # Slurm executor shared by all jobs added to the experiment below.
    executor = slurm_executor(
        args.account,
        args.partition,
        args.log_dir,
        num_nodes,
        args.gpus_per_node,
        args.time_limit,
        args.container_image,
        custom_mounts=args.custom_mounts,
        custom_env_vars={},
        hf_token=args.hf_token,
        nemo_home=args.nemo_home,
        wandb_key=args.wandb_key,
    )

    # Jobs are added in dependency order and run sequentially (see exp.run below):
    # optional checkpoint import, optional dataset preparation, then fine-tuning.
    with run.Experiment(exp_name) as exp:
        if not SKIP_IMPORT:
            assert args.hf_token is not None, "HF token is required for importing checkpoint from HuggingFace"
            exp.add(*import_ckpt_experiment(executor, model(), source=f"hf://{HF_MODEL_URI}"))
        if not SKIP_DATASET_DOWNLOAD:
            # NOTE(review): seq_length=4096 is hard-coded here; presumably it must match
            # the recipe's packed-sequence length — confirm against finetune_recipe.
            exp.add(
                *prepare_squad_dataset_experiment(executor, HF_MODEL_URI, seq_length=4096, nemo_home=args.nemo_home)
            )
        exp.add(
            recipe,
            executor=executor,
            name=exp_name,
            plugins=plugins,
        )
        if not args.dryrun:
            exp.run(sequential=True, detach=True)
        else:
            exp.dryrun()
Loading
Loading