-
Notifications
You must be signed in to change notification settings - Fork 200
Expand file tree
/
Copy pathnemotron_nano_v3_hellaswag.yaml
More file actions
115 lines (94 loc) · 2.98 KB
/
Copy pathnemotron_nano_v3_hellaswag.yaml
File metadata and controls
115 lines (94 loc) · 2.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# To run this recipe:
# automodel examples/llm_finetune/nemotron/nemotron_nano_v3_hellaswag.yaml --nproc-per-node 8
# Adjust --nproc-per-node to the number of GPUs available on your machine.
# Name of the recipe class this configuration is written for.
recipe: TrainFinetuneRecipeForNextTokenPrediction
# Step scheduling: batch sizes plus checkpoint/validation cadence.
# NOTE(review): both intervals below (1000) exceed max_steps (100), so neither
# periodic trigger is reached within this run unless the recipe also fires on
# the final step -- confirm this is intended.
step_scheduler:
  global_batch_size: 256
  local_batch_size: 8
  ckpt_every_steps: 1000
  val_every_steps: 1000  # validation interval, counted in gradient steps
  max_steps: 100
# Distributed process-group environment.
dist_env:
  backend: nccl
  # NOTE(review): a 1-minute timeout looks tight for a 30B-parameter
  # checkpoint -- confirm it is long enough for model loading.
  timeout_minutes: 1
# Random-number-generator setup, built from the class named in `_target_`.
rng:
  _target_: nemo_automodel.components.training.rng.StatefulRNG
  seed: 1111
  # presumably derives a distinct seed per rank -- verify against StatefulRNG
  ranked: true
# Base model to fine-tune, loaded through `from_pretrained`.
model:
  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
  pretrained_model_name_or_path: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16
  # Permits running the custom modeling code shipped with the checkpoint.
  trust_remote_code: true
# Checkpoint output: where to write and in which format.
checkpoint:
  enabled: true
  checkpoint_dir: checkpoints/
  model_save_format: safetensors
  # presumably consolidates only the final checkpoint -- verify against the
  # checkpointing component
  save_consolidated: final
# Parallelism layout. The command in the file header launches 8 processes per
# node, which matches ep_size below.
distributed:
  strategy: fsdp2
  # NOTE(review): a plain `none` is the string "none" to a YAML parser, not
  # null; presumably the config loader maps it to None -- confirm.
  dp_size: none
  tp_size: 1
  cp_size: 1
  ep_size: 8
  sequence_parallel: false
  defer_fsdp_grad_sync: false
# Training loss, built from the class named in `_target_`.
loss_fn:
  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy
# Training data: the `train` split of rowan/hellaswag.
dataset:
  _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag
  path_or_dataset: rowan/hellaswag
  split: train
# Sequence packing.
packed_sequence:
  # presumably 0 disables packing -- verify against the recipe
  packed_sequence_size: 0
# Training dataloader.
dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  collate_fn: nemo_automodel.components.datasets.utils.default_collater
  # Canonical lowercase boolean, consistent with every other flag in this file.
  shuffle: true
# Validation data: the `validation` split of rowan/hellaswag, capped at 64
# samples via num_samples_limit.
validation_dataset:
  _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag
  path_or_dataset: rowan/hellaswag
  split: validation
  num_samples_limit: 64
# Validation dataloader; no `shuffle` key is set here, unlike the training
# dataloader.
validation_dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  collate_fn: nemo_automodel.components.datasets.utils.default_collater
# Optimizer: torch.optim.Adam with the hyperparameters below.
optimizer:
  _target_: torch.optim.Adam
  betas: [0.9, 0.999]
  # Written with an explicit decimal point: YAML 1.1 loaders (e.g. PyYAML)
  # resolve a bare `1e-8` as a string rather than a float.
  eps: 1.0e-8
  lr: 1.0e-5
  weight_decay: 0
# Learning-rate schedule: cosine decay with a floor of min_lr.
lr_scheduler:
  lr_decay_style: cosine
  min_lr: 1.0e-6
# CI metadata for this recipe.
# NOTE(review): the nesting below is reconstructed; `checkpoint_robustness` is
# assumed to be a child of `ci`, and the dotted keys are assumed to be
# config overrides belonging to it -- confirm against the CI consumer.
ci:
  vllm_deploy: true
  vllm_smoke_test: true
  recipe_owner: adil-a
  # Quoted so the value stays a string instead of being parsed as a
  # sexagesimal number.
  time: "00:15:00"
  checkpoint_robustness:
    # Explicit decimal point so YAML 1.1 loaders resolve this as a float; a
    # bare `7e-2` is read as a string.
    hf_kl_threshold: 7.0e-2
    hf_device_map_auto: true
    experts_implementation: grouped_mm
    tokenizer_name: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16
    no_check_resume: true
    dataset.num_samples_limit: 500
    validation_dataset.num_samples_limit: 500
# wandb:
# project: <your_wandb_project>
# entity: <your_wandb_entity>
# name: <your_wandb_exp_name>
# save_dir: <your_wandb_save_dir>