
Commit 766f71f

Removing NullTokenizer from Finetuning scripts

Signed-off-by: Raghav Hrishikeshan Mukundan <rmukundan@nvidia.com>

1 parent 3deddf4 commit 766f71f

File tree: 5 files changed, +12 -88 lines

scripts/performance/llm/finetune_llama31_405b.py
scripts/performance/llm/finetune_llama3_70b.py
scripts/performance/llm/finetune_llama3_8b.py
scripts/performance/llm/finetune_llama4_e128.py
scripts/performance/utils.py

scripts/performance/llm/finetune_llama31_405b.py

Lines changed: 2 additions & 15 deletions
@@ -108,16 +108,7 @@ def override_recipe_configs(
     )
 
     # data module configs
-    if args.use_hf_tokenizer:
-        recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI)
-    else:
-        recipe.data.tokenizer = run.Config(
-            get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=128256
-        )
-    recipe.model.tokenizer = recipe.data.tokenizer
-    if recipe.data.__fn_or_cls__ == SquadDataModule and not isfile_train_pack_metadata(HF_MODEL_URI, recipe.data):
-        # flag is valid only for SquadDataModule
-        recipe.data.force_redownload = True
+    recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI)
 
     comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks)
     assert comm_overlap_callback_idx is not None, "MegatronCommOverlapCallback missing. Required for performance."
@@ -227,11 +218,7 @@ def override_recipe_configs(
         assert args.hf_token is not None, "HF token is required for importing checkpoint from HuggingFace"
         exp.add(*import_ckpt_experiment(executor, model(), source=f"hf://{HF_MODEL_URI}"))
         if not SKIP_DATASET_DOWNLOAD:
-            exp.add(
-                *prepare_squad_dataset_experiment(
-                    executor, HF_MODEL_URI, seq_length=4096, nemo_home=args.nemo_home, vocab_size=128256
-                )
-            )
+            exp.add(*prepare_squad_dataset_experiment(executor, HF_MODEL_URI, seq_length=4096, nemo_home=args.nemo_home))
         exp.add(
             recipe,
             executor=executor,
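
The same change repeats in the next three finetuning scripts. For context, hf_tokenizer(HF_MODEL_URI) is a shared helper in these performance scripts; a minimal sketch of what it plausibly does follows. The call signature comes from the call sites above and the AutoTokenizer import path from the utils.py diff below, but the body itself is an assumption, not the repository's exact code.

    # Hypothetical sketch of the hf_tokenizer helper referenced above.
    # The body is an assumption; only the AutoTokenizer import path and the
    # run.Config pattern are confirmed by this diff.
    import nemo_run as run
    from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer

    def hf_tokenizer(model_name: str) -> run.Config:
        """Build a serializable config for the model's HuggingFace tokenizer."""
        return run.Config(AutoTokenizer, pretrained_model_name=model_name)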

scripts/performance/llm/finetune_llama3_70b.py

Lines changed: 2 additions & 15 deletions
@@ -115,16 +115,7 @@ def override_recipe_configs(
     )
 
     # data module configs
-    if args.use_hf_tokenizer:
-        recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI)
-    else:
-        recipe.data.tokenizer = run.Config(
-            get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=128256
-        )
-    recipe.model.tokenizer = recipe.data.tokenizer
-    # if recipe.data.__fn_or_cls__ == SquadDataModule and not isfile_train_pack_metadata(HF_MODEL_URI, recipe.data):
-    #     # flag is valid only for SquadDataModule
-    #     recipe.data.force_redownload = True
+    recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI)
 
     comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks)
     assert comm_overlap_callback_idx is not None, "MegatronCommOverlapCallback missing. Required for performance."
@@ -234,11 +225,7 @@ def override_recipe_configs(
         assert args.hf_token is not None, "HF token is required for importing checkpoint from HuggingFace"
         exp.add(*import_ckpt_experiment(executor, model(), source=f"hf://{HF_MODEL_URI}"))
         if not SKIP_DATASET_DOWNLOAD:
-            exp.add(
-                *prepare_squad_dataset_experiment(
-                    executor, HF_MODEL_URI, seq_length=4096, nemo_home=args.nemo_home, vocab_size=128256
-                )
-            )
+            exp.add(*prepare_squad_dataset_experiment(executor, HF_MODEL_URI, seq_length=4096, nemo_home=args.nemo_home))
 
         exp.add(
             recipe,

scripts/performance/llm/finetune_llama3_8b.py

Lines changed: 2 additions & 15 deletions
@@ -102,16 +102,7 @@ def override_recipe_configs(
     )
 
     # data module configs
-    if args.use_hf_tokenizer:
-        recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI)
-    else:
-        recipe.data.tokenizer = run.Config(
-            get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=128256
-        )
-    recipe.model.tokenizer = recipe.data.tokenizer
-    if recipe.data.__fn_or_cls__ == SquadDataModule and not isfile_train_pack_metadata(HF_MODEL_URI, recipe.data):
-        # flag is valid only for SquadDataModule
-        recipe.data.force_redownload = True
+    recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI)
 
     recipe.optim.config.use_distributed_optimizer = True
     recipe.model.config.disable_parameter_transpose_cache = True
@@ -168,11 +159,7 @@ def override_recipe_configs(
         assert args.hf_token is not None, "HF token is required for importing checkpoint from HuggingFace"
         exp.add(*import_ckpt_experiment(executor, model(), source=f"hf://{HF_MODEL_URI}"))
         if not SKIP_DATASET_DOWNLOAD:
-            exp.add(
-                *prepare_squad_dataset_experiment(
-                    executor, HF_MODEL_URI, seq_length=4096, nemo_home=args.nemo_home, vocab_size=128256
-                )
-            )
+            exp.add(*prepare_squad_dataset_experiment(executor, HF_MODEL_URI, seq_length=4096, nemo_home=args.nemo_home))
         exp.add(
             recipe,
             executor=executor,

scripts/performance/llm/finetune_llama4_e128.py

Lines changed: 2 additions & 20 deletions
@@ -108,16 +108,7 @@ def override_recipe_configs(
     )
 
     # data module configs
-    if args.use_hf_tokenizer:
-        recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI)
-    else:
-        recipe.data.tokenizer = run.Config(
-            get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=202048
-        )
-    recipe.model.tokenizer = recipe.data.tokenizer
-    # #If you want to force redownload for SquadDataModule, uncomment and adjust the following:
-    # if recipe.data.__fn_or_cls__ == SquadDataModule and not isfile_train_pack_metadata(HF_MODEL_URI, recipe.data):
-    #     SKIP_DATASET_DOWNLOAD = True
+    recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI)
 
     # Compute dtype configs
     if args.compute_dtype.lower() == "fp8":
@@ -201,16 +192,7 @@ def override_recipe_configs(
         assert args.hf_token is not None, "HF token is required for importing checkpoint from HuggingFace"
         exp.add(*import_ckpt_experiment(executor, model(), source=f"hf://{HF_MODEL_URI}"))
         if not SKIP_DATASET_DOWNLOAD:
-            exp.add(
-                *prepare_squad_dataset_experiment(
-                    executor,
-                    HF_MODEL_URI,
-                    seq_length=4096,
-                    nemo_home=args.nemo_home,
-                    use_hf_tokenizer=args.use_hf_tokenizer,
-                    vocab_size=202048,
-                )
-            )
+            exp.add(*prepare_squad_dataset_experiment(executor, HF_MODEL_URI, seq_length=4096, nemo_home=args.nemo_home))
         exp.add(
             recipe,
             executor=executor,

scripts/performance/utils.py

Lines changed: 4 additions & 23 deletions
@@ -91,39 +91,28 @@ def get_nemo_home(nemo_home=None):
     if env_nemo_set:
         return os.environ["NEMO_HOME"]
 
-    raise ValueError("Neither nemo_home argument nor NEMO_HOME environment variable is set")
+    raise ValueError("Neither -nh/--nemo_home argument nor NEMO_HOME environment variable is set")
 
 
-def prepare_squad_dataset(
-    model_name: str, seq_length: int = 2048, nemo_home=None, use_hf_tokenizer=True, vocab_size=None
-):
+def prepare_squad_dataset(model_name: str, seq_length: int = 2048, nemo_home=None):
     """Prepare the SQuAD dataset for fine-tuning.
 
     Args:
         model_name (str): The name of the model
         seq_length (int): The sequence length to use for packing. Defaults to 2048.
         nemo_home: Optional path to NEMO home directory set via args.nemo_home
-        use_hf_tokenizer: Whether to use HuggingFace tokenizer or NullTokenizer
-        vocab_size: Vocabulary size to use when use_hf_tokenizer is False. Required when use_hf_tokenizer is False.
     """
     from pathlib import Path
 
     from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
-    from nemo.collections.common.tokenizers.null_tokenizer import NullTokenizer
     from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs
     from nemo.collections.llm.gpt.data.squad import SquadDataModule
 
-    if not use_hf_tokenizer and vocab_size is None:
-        raise ValueError("vocab_size must be provided when use_hf_tokenizer is False")
-
     nemo_home_path = Path(get_nemo_home(nemo_home))
     dataset_root = nemo_home_path / "datasets" / "squad"
     dataset_root.mkdir(parents=True, exist_ok=True)
 
-    if use_hf_tokenizer:
-        tokenizer = AutoTokenizer(pretrained_model_name=model_name)
-    else:
-        tokenizer = NullTokenizer(vocab_size=vocab_size)
+    tokenizer = AutoTokenizer(pretrained_model_name=model_name)
 
     # Configure SquadDataModule with packing specs
     datamodule = SquadDataModule(
@@ -150,14 +139,7 @@ def prepare_squad_dataset(
         raise FileNotFoundError(f"Packed dataset dir not found at {packed_dir}. Dataset download failed")
 
 
-def prepare_squad_dataset_experiment(
-    executor: run.SlurmExecutor,
-    model_name: str,
-    seq_length: int = 2048,
-    nemo_home=None,
-    use_hf_tokenizer=True,
-    vocab_size=None,
-):
+def prepare_squad_dataset_experiment(executor: run.SlurmExecutor, model_name: str, seq_length: int = 2048, nemo_home=None):
     """
     Downloads and prepares the SQuAD dataset for fine-tuning.
     """
@@ -173,7 +155,6 @@ def prepare_squad_dataset_experiment(
             model_name=model_name,
             seq_length=seq_length,
            nemo_home=nemo_home,
-            use_hf_tokenizer=use_hf_tokenizer,
         ),
         dataset_executor,
         "prepare_squad_dataset_exp",
