---
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# To run this recipe:
# automodel examples/llm_finetune/nemotron/nemotron_nano_v3_hellaswag_peft.yaml --nproc-per-node 8
# Adjust --nproc-per-node to the number of GPUs available on your machine.
# Recipe selected for this run.
recipe: TrainFinetuneRecipeForNextTokenPrediction

# Batch sizes and step-based cadence for checkpointing and validation.
# NOTE(review): ckpt_every_steps and val_every_steps (1000) both exceed
# max_steps (100), so presumably neither interval is reached mid-run -
# confirm that this is intended.
step_scheduler:
  global_batch_size: 8
  local_batch_size: 1
  ckpt_every_steps: 1000
  val_every_steps: 1000  # will run every x number of gradient steps
  max_steps: 100
# Distributed environment: communication backend and timeout (in minutes).
dist_env:
  backend: nccl
  timeout_minutes: 1
# Random-number-generator settings; `_target_` points at StatefulRNG and the
# seed is fixed.
rng:
  _target_: nemo_automodel.components.training.rng.StatefulRNG
  seed: 1111
  ranked: true
# Base model for fine-tuning; `_target_` points at
# NeMoAutoModelForCausalLM.from_pretrained.
model:
  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
  pretrained_model_name_or_path: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16
  # NOTE(review): placing `backend` under `model` is inferred from its
  # BackendConfig target and its position - confirm against upstream.
  backend:
    _target_: nemo_automodel.components.models.common.BackendConfig
    rms_norm: torch_fp32
    linear: torch
# torch.compile configuration (disabled in this recipe via `enabled: false`).
compile:
  enabled: false
  mode: "default"  # Options: "default", "reduce-overhead", "max-autotune"
  fullgraph: false
  dynamic: true  # Set to false for better performance with fixed shapes
  backend: null  # Use default backend (inductor)
# Checkpointing: on/off switch, output directory and save format.
checkpoint:
  enabled: true
  checkpoint_dir: checkpoints/
  model_save_format: safetensors
  save_consolidated: final
# LoRA adapter settings; `_target_` points at PeftConfig.
peft:
  _target_: nemo_automodel.components._peft.lora.PeftConfig
  # Mamba layers use custom kernels that take in the out_proj.weight
  # directly, thus LoRA doesn't work on those projections.
  exclude_modules: ["*.out_proj"]
  dim: 8
  alpha: 32
  use_triton: true
# Parallelism layout: sharding strategy plus the data/tensor/context/expert
# parallel sizes.
distributed:
  strategy: fsdp2
  # Written as `null` (as `backend: null` is in the compile section) so every
  # YAML parser yields a real null; a plain `none` loads as the string "none".
  dp_size: null
  tp_size: 1
  cp_size: 1
  ep_size: 4
  sequence_parallel: false
# Training loss; `_target_` points at MaskedCrossEntropy.
loss_fn:
  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy
# Training data: the `train` split of rowan/hellaswag.
dataset:
  _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag
  path_or_dataset: rowan/hellaswag
  split: train
# Sequence-packing settings.
# NOTE(review): a size of 0 presumably disables packing - confirm.
packed_sequence:
  packed_sequence_size: 0
# Training dataloader: StatefulDataLoader with the default collater and
# shuffling turned on.
dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  collate_fn: nemo_automodel.components.datasets.utils.default_collater
  shuffle: true
# Validation data: the `validation` split of rowan/hellaswag, limited to 64
# samples.
validation_dataset:
  _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag
  path_or_dataset: rowan/hellaswag
  split: validation
  num_samples_limit: 64
# Validation dataloader: StatefulDataLoader with the default collater; no
# `shuffle` key is set here.
validation_dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  collate_fn: nemo_automodel.components.datasets.utils.default_collater
# Optimizer: torch.optim.Adam.
optimizer:
  _target_: torch.optim.Adam
  betas: [0.9, 0.999]
  # Keep the decimal point: YAML 1.1 parsers (e.g. PyYAML) load a bare
  # `1e-8` as a string, whereas `1.0e-8` is a float everywhere.
  eps: 1.0e-8
  lr: 1.0e-5
  weight_decay: 0
# Learning-rate schedule: cosine decay style with a minimum learning rate.
lr_scheduler:
  lr_decay_style: cosine
  min_lr: 1.0e-6
# CI metadata for this recipe.
# NOTE(review): the nesting is reconstructed; treating every key after
# `checkpoint_robustness:` as its child is an assumption - confirm upstream.
ci:
  vllm_deploy: true
  vllm_smoke_test: true
  recipe_owner: adil-a
  time: "00:15:00"
  known_issue_id: AM-149
  checkpoint_robustness:
    # Decimal point kept so YAML 1.1 parsers load a float, not a string.
    hf_kl_threshold: 1.0e-1
    hf_device_map_auto: true
    experts_implementation: grouped_mm
    trust_remote_code: true
    tokenizer_name: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16
    # Dotted keys presumably override the matching top-level settings for
    # this check - confirm against the CI harness.
    distributed.ep_size: 8
    no_check_resume: true
    dataset.num_samples_limit: 500
    validation_dataset.num_samples_limit: 500
# wandb:
#   project: <your_wandb_project>
#   entity: <your_wandb_entity>
#   name: <your_wandb_exp_name>
#   save_dir: <your_wandb_save_dir>