# Source: Automodel/examples/llm_finetune/nemotron/nemotron_nano_v3_hellaswag_peft.yaml (NVIDIA-NeMo/Automodel, main branch)

# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# To run this recipe:
#   automodel examples/llm_finetune/nemotron/nemotron_nano_v3_hellaswag_peft.yaml --nproc-per-node 8
# Adjust --nproc-per-node to the number of GPUs available on your machine.

# Recipe to run, referenced by name (presumably resolved by the automodel CLI — confirm).
recipe: TrainFinetuneRecipeForNextTokenPrediction

# Step scheduling: batch sizes plus checkpoint/validation cadence.
# NOTE(review): ckpt_every_steps and val_every_steps (1000) both exceed
# max_steps (100), so neither interval is reached during a run unless the
# recipe also triggers them at the final step — confirm this is intended.
step_scheduler:
  global_batch_size: 8
  local_batch_size: 1
  ckpt_every_steps: 1000
  val_every_steps: 1000  # will run every x number of gradient steps
  max_steps: 100

# Distributed process-group settings.
# NOTE(review): timeout_minutes is only 1 — confirm this is long enough for
# the 30B checkpoint named under `model`.
dist_env:
  backend: nccl
  timeout_minutes: 1

# Random-number-generator state object, built from StatefulRNG with a fixed seed.
# presumably `ranked: true` derives a distinct seed per rank — verify against StatefulRNG.
rng:
  _target_: nemo_automodel.components.training.rng.StatefulRNG
  seed: 1111
  ranked: true

# Base model, loaded through from_pretrained from the given model name or path.
model:
  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
  pretrained_model_name_or_path: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16
  # Per-op backend selection passed as a BackendConfig.
  backend:
    _target_: nemo_automodel.components.models.common.BackendConfig
    rms_norm: torch_fp32
    linear: torch

# torch.compile configuration
# Compilation is switched off here (`enabled: false`); presumably the
# remaining keys only take effect once it is enabled — TODO confirm.
compile:
  enabled: false
  mode: "default"  # Options: "default", "reduce-overhead", "max-autotune"
  fullgraph: false
  dynamic: true  # Set to false for better performance with fixed shapes
  backend: null  # Use default backend (inductor)

# Checkpointing: enabled, written under checkpoints/ in safetensors format.
# presumably `save_consolidated: final` consolidates only the last checkpoint — verify.
checkpoint:
  enabled: true
  checkpoint_dir: checkpoints/
  model_save_format: safetensors
  save_consolidated: final

# LoRA-based PEFT configuration.
peft:
  _target_: nemo_automodel.components._peft.lora.PeftConfig
  # Mamba layers use custom kernels that take in the out_proj.weight directly,
  # thus LoRA doesn't work on these modules.
  exclude_modules: ["*.out_proj"]
  dim: 8
  alpha: 32
  use_triton: true

# Parallelism layout for the fsdp2 strategy.
# NOTE(review): `none` is a plain string to a YAML parser (unlike the `null`
# used for compile.backend); presumably the config loader maps it to "unset"
# so dp_size is inferred — confirm against the loader.
distributed:
  strategy: fsdp2
  dp_size: none
  tp_size: 1
  cp_size: 1
  ep_size: 4

  sequence_parallel: false

# Training loss: the MaskedCrossEntropy component.
loss_fn:
  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy

# Training data: the `train` split of rowan/hellaswag.
dataset:
  _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag
  path_or_dataset: rowan/hellaswag
  split: train

# presumably a size of 0 leaves sequence packing disabled — TODO confirm.
packed_sequence:
  packed_sequence_size: 0

# Training dataloader: StatefulDataLoader with the default collater, shuffling enabled.
dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  collate_fn: nemo_automodel.components.datasets.utils.default_collater
  shuffle: true

# Validation data: the `validation` split of rowan/hellaswag, with
# num_samples_limit set to 64.
validation_dataset:
  _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag
  path_or_dataset: rowan/hellaswag
  split: validation
  num_samples_limit: 64

# Validation dataloader: StatefulDataLoader with the default collater; no
# shuffle key is set here.
validation_dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  collate_fn: nemo_automodel.components.datasets.utils.default_collater

# Adam optimizer hyperparameters.
optimizer:
  _target_: torch.optim.Adam
  betas: [0.9, 0.999]
  # Written with a decimal point so YAML 1.1 parsers (e.g. PyYAML) resolve it
  # as a float; a bare `1e-8` is read as a string.
  eps: 1.0e-8
  lr: 1.0e-5
  weight_decay: 0

# Learning-rate schedule: cosine decay style with min_lr of 1.0e-6.
lr_scheduler:
  lr_decay_style: cosine
  min_lr: 1.0e-6

# CI metadata and settings for automated runs of this recipe.
ci:
  vllm_deploy: true
  vllm_smoke_test: true
  recipe_owner: adil-a
  time: "00:15:00"
  known_issue_id: AM-149
  checkpoint_robustness:
    # Written with a decimal point so YAML 1.1 parsers (e.g. PyYAML) resolve it
    # as a float; a bare `1e-1` is read as a string.
    hf_kl_threshold: 1.0e-1
    hf_device_map_auto: true
    experts_implementation: grouped_mm
    trust_remote_code: true
    tokenizer_name: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16
    # Dotted keys presumably override the matching top-level settings for this
    # check (e.g. distributed.ep_size) — TODO confirm.
    distributed.ep_size: 8
    no_check_resume: true
    dataset.num_samples_limit: 500
    validation_dataset.num_samples_limit: 500

# wandb:
#   project: <your_wandb_project>
#   entity: <your_wandb_entity>
#   name: <your_wandb_exp_name>
#   save_dir: <your_wandb_save_dir>