# Source: examples/llm_finetune/deepseek_v4/deepseek_v4_flash_hellaswag.yaml (NVIDIA-NeMo/Automodel, main branch)
# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

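# To run this recipe with 16 H100 nodes (128 GPUs), launch 8 processes per node.
# torchrun needs a Python entry point; pass this file to it via --config
# (multi-node launches also need torchrun's --nnodes / rendezvous options):
#   torchrun --nproc-per-node 8 examples/llm_finetune/finetune.py --config examples/llm_finetune/deepseek_v4/deepseek_v4_flash_hellaswag.yaml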


# Recipe identifier for this run (a next-token-prediction finetuning recipe,
# going by its name).
recipe: TrainFinetuneRecipeForNextTokenPrediction

# Seed value for the run.
seed: 1234

# Run length, batch sizing, and how often periodic actions fire.
step_scheduler:
  # Run length.
  num_epochs: 1
  max_steps: 100
  # Batch sizes.
  local_batch_size: 8
  global_batch_size: 256
  # Periodic actions.
  # NOTE(review): the validation and checkpoint intervals (500) are larger
  # than max_steps (100), so presumably neither fires on its interval during
  # this run -- confirm that is intended.
  gc_every_steps: 10
  val_every_steps: 500
  ckpt_every_steps: 500

# Parallelism layout for the run.
distributed:
  strategy: fsdp2

  # Parallel sizes; tp and cp are left at 1 in this recipe.
  pp_size: 4
  ep_size: 32
  tp_size: 1
  cp_size: 1

  activation_checkpointing: false
  sequence_parallel: false

  # Pipeline schedule settings (used together with pp_size: 4 above).
  pipeline:
    pp_schedule: interleaved1f1b
    layers_per_stage: 2
    pp_microbatch_size: 1
    round_virtual_stages_to_pp_multiple: down
    scale_grads_in_schedule: false
    patch_causal_lm_model: false
    patch_inner_model: false

  # MoE-specific toggles; both are disabled here.
  moe:
    wrap_outer_model: false
    reshard_after_forward: false

# Distributed environment: communication backend and its timeout.
dist_env:
  timeout_minutes: 30
  backend: nccl

# Model definition: a DeepseekV4Config is loaded from the
# deepseek-ai/DeepSeek-V4-Flash repo id and nested as the `config` argument of
# NeMoAutoModelForCausalLM.from_config.
model:
  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_config
  config:
    _target_: nemo_automodel.components.models.deepseek_v4.config.DeepseekV4Config.from_pretrained
    pretrained_model_name_or_path: deepseek-ai/DeepSeek-V4-Flash
    # NOTE(review): name_or_path repeats the repo id above; presumably it pins
    # the source recorded on the config for later weight lookup -- confirm
    # against the loader.
    name_or_path: deepseek-ai/DeepSeek-V4-Flash
    # Keep standard finetuning MTP-free by default. Set to 1 to enable
    # DeepSeek-V4 Flash MTP training with the auxiliary loss below.
    num_nextn_predict_layers: 0
  trust_remote_code: false
  load_base_model: true
  # DeepSeek-V4 uses 0.3 for most pretraining, then 0.1 during LR decay.
  # Keep finetuning/RL conservative unless explicitly reproducing pretraining.
  mtp_loss_scaling_factor: 0.1
  # Per-component implementation choices (attention, linear, RMSNorm, experts,
  # token dispatcher) plus state-dict-adapter and FSDP-optimization toggles.
  backend:
    _target_: nemo_automodel.components.models.common.BackendConfig
    attn: tilelang
    linear: torch
    rms_norm: torch_fp32
    rope_fusion: false
    dispatcher: hybridep
    experts: torch_mm
    enable_hf_state_dict_adapter: true
    enable_fsdp_optimizations: true

# Checkpoint settings; checkpointing itself is switched off in this recipe.
checkpoint:
  dequantize_base_checkpoint: true
  enabled: false

# Training loss: the MaskedCrossEntropy class from nemo_automodel's loss
# components.
loss_fn:
  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy

# Training data: the `train` split of rowan/hellaswag, paired with the
# DeepSeek-V4-Flash tokenizer.
dataset:
  _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag
  tokenizer:
    _target_: transformers.AutoTokenizer.from_pretrained
    pretrained_model_name_or_path: deepseek-ai/DeepSeek-V4-Flash
  split: train
  path_or_dataset: rowan/hellaswag

# Sequence-packing settings.
# NOTE(review): a size of 0 presumably disables packing -- confirm.
packed_sequence:
  packed_sequence_size: 0

# Loader for the training dataset, with shuffling enabled.
dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  shuffle: true
  collate_fn:
    _target_: nemo_automodel.components.datasets.utils.default_collater
    # Presumably pads each batch's sequence length to a multiple of 64
    # (inferred from the key name) -- confirm against default_collater.
    pad_seq_len_divisible: 64

# Validation data: the `validation` split of rowan/hellaswag, paired with the
# DeepSeek-V4-Flash tokenizer.
validation_dataset:
  _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag
  tokenizer:
    _target_: transformers.AutoTokenizer.from_pretrained
    pretrained_model_name_or_path: deepseek-ai/DeepSeek-V4-Flash
  split: validation
  path_or_dataset: rowan/hellaswag

# Loader for the validation dataset: no shuffling, and drop_last is enabled.
validation_dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  shuffle: false
  drop_last: true
  collate_fn:
    _target_: nemo_automodel.components.datasets.utils.default_collater
    pad_seq_len_divisible: 64

# AdamW optimizer settings; the keys besides _target_ match
# torch.optim.AdamW constructor arguments.
optimizer:
  _target_: torch.optim.AdamW
  betas:
  - 0.9
  - 0.95
  # Exponent-form floats carry an explicit ".0": YAML 1.1 loaders such as
  # PyYAML resolve a dot-less "1e-8" as a string rather than a float.
  eps: 1.0e-8
  lr: 1.0e-5
  weight_decay: 0.1

# CI job settings for this recipe.
ci:
  recipe_owner: hemildesai
  # Cold multi-node start + TileLang JIT runs past the 00:10:00 default.
  time: '00:30:00'
  # pp_size(4) * ep_size(32) = 128 GPUs => 16 nodes (8 H100/node).
  nodes: 16