-
Notifications
You must be signed in to change notification settings - Fork 200
Expand file tree
/
Copy pathdeepseek_v4_flash_hellaswag.yaml
More file actions
139 lines (119 loc) · 3.9 KB
/
Copy pathdeepseek_v4_flash_hellaswag.yaml
File metadata and controls
139 lines (119 loc) · 3.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
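# To run this recipe on 16 H100 nodes (128 GPUs), launch 8 processes per node.
# NOTE(review): the command below passes this YAML file where torchrun expects
# a script and repeats --nproc-per-node; confirm the intended launcher and
# entry point before copying it.
# torchrun --nproc-per-node 8 examples/llm_finetune/deepseek_v4/deepseek_v4_flash_hellaswag.yaml --nproc-per-node 8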
# Recipe selector; the value names a next-token-prediction finetuning recipe.
recipe: TrainFinetuneRecipeForNextTokenPrediction
# Seed for the run (presumably used for RNG reproducibility — confirm against the recipe).
seed: 1234
# Step scheduling: batch sizes, run length, and periodic-event intervals.
step_scheduler:
  # Batch sizing.
  global_batch_size: 256
  local_batch_size: 8
  # Run length.
  num_epochs: 1
  max_steps: 100
  # Periodic intervals, counted in steps.
  # NOTE(review): both 500-step intervals exceed max_steps (100) — confirm
  # that this is intended for such a short run.
  ckpt_every_steps: 500
  val_every_steps: 500
  gc_every_steps: 10
# Parallelism layout for the run.
# NOTE(review): `pipeline` and `moe` are placed as sub-mappings of
# `distributed`; verify this nesting against the consuming schema.
distributed:
  strategy: fsdp2
  # Parallel degrees; pp_size * ep_size = 4 * 32 = 128.
  pp_size: 4
  ep_size: 32
  tp_size: 1
  cp_size: 1
  sequence_parallel: false
  activation_checkpointing: false
  # Pipeline-parallel schedule settings.
  pipeline:
    pp_schedule: interleaved1f1b
    pp_microbatch_size: 1
    layers_per_stage: 2
    round_virtual_stages_to_pp_multiple: down
    scale_grads_in_schedule: false
    patch_inner_model: false
    patch_causal_lm_model: false
  # MoE-specific switches.
  moe:
    reshard_after_forward: false
    wrap_outer_model: false
# Distributed environment: communication backend and its timeout (minutes).
dist_env:
  timeout_minutes: 30
  backend: nccl
# Model definition: built through NeMoAutoModelForCausalLM.from_config using a
# DeepseekV4Config loaded from deepseek-ai/DeepSeek-V4-Flash.
# NOTE(review): this copy has lost its indentation, so whether
# trust_remote_code, load_base_model and mtp_loss_scaling_factor belong to
# `config` or to `model` itself cannot be determined here — restore the
# nesting from the upstream file before use.
model:
_target_: nemo_automodel.NeMoAutoModelForCausalLM.from_config
# Config object for the model, created by DeepseekV4Config.from_pretrained.
config:
_target_: nemo_automodel.components.models.deepseek_v4.config.DeepseekV4Config.from_pretrained
pretrained_model_name_or_path: deepseek-ai/DeepSeek-V4-Flash
name_or_path: deepseek-ai/DeepSeek-V4-Flash
# Keep standard finetuning MTP-free by default. Set to 1 to enable
# DeepSeek-V4 Flash MTP training with the auxiliary loss below.
num_nextn_predict_layers: 0
trust_remote_code: false
load_base_model: true
# DeepSeek-V4 uses 0.3 for most pretraining, then 0.1 during LR decay.
# Keep finetuning/RL conservative unless explicitly reproducing pretraining.
mtp_loss_scaling_factor: 0.1
# Backend selection (a BackendConfig); the keys below presumably pick the
# implementation per component (attention, linear, RMSNorm, MoE
# dispatcher/experts) — confirm against BackendConfig.
backend:
_target_: nemo_automodel.components.models.common.BackendConfig
attn: tilelang
linear: torch
rms_norm: torch_fp32
rope_fusion: false
dispatcher: hybridep
experts: torch_mm
enable_hf_state_dict_adapter: true
enable_fsdp_optimizations: true
# Checkpoint settings; `enabled` is off for this recipe.
checkpoint:
  dequantize_base_checkpoint: true
  enabled: false
# Training loss: the MaskedCrossEntropy class from the masked_ce module.
loss_fn:
  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy
# Training data: the `train` split of rowan/hellaswag, with a tokenizer
# loaded from deepseek-ai/DeepSeek-V4-Flash.
dataset:
  _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag
  split: train
  path_or_dataset: rowan/hellaswag
  tokenizer:
    _target_: transformers.AutoTokenizer.from_pretrained
    pretrained_model_name_or_path: deepseek-ai/DeepSeek-V4-Flash
# Sequence packing (a size of 0 presumably disables packing — confirm).
packed_sequence:
  packed_sequence_size: 0
# Training dataloader: a StatefulDataLoader with shuffling enabled.
dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  shuffle: true
  # Collate function and its padding-divisibility setting.
  collate_fn:
    _target_: nemo_automodel.components.datasets.utils.default_collater
    pad_seq_len_divisible: 64
# Validation data: the `validation` split of rowan/hellaswag, with a
# tokenizer loaded from deepseek-ai/DeepSeek-V4-Flash.
validation_dataset:
  _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag
  split: validation
  path_or_dataset: rowan/hellaswag
  tokenizer:
    _target_: transformers.AutoTokenizer.from_pretrained
    pretrained_model_name_or_path: deepseek-ai/DeepSeek-V4-Flash
# Validation dataloader: a StatefulDataLoader with shuffling off and
# drop_last on.
validation_dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  drop_last: true
  shuffle: false
  # Collate function and its padding-divisibility setting.
  collate_fn:
    _target_: nemo_automodel.components.datasets.utils.default_collater
    pad_seq_len_divisible: 64
# Optimizer: torch.optim.AdamW with the hyperparameters below.
optimizer:
  _target_: torch.optim.AdamW
  # Exponent literals carry an explicit mantissa dot (1.0e-5, not 1e-5):
  # YAML 1.1 loaders such as PyYAML resolve the dot-less form as a string,
  # while the dotted form is a float under both YAML 1.1 and 1.2.
  lr: 1.0e-5
  eps: 1.0e-8
  weight_decay: 0.1
  betas:
    - 0.9
    - 0.95
# CI launch metadata for this recipe.
ci:
  recipe_owner: hemildesai
  # pp_size(4) * ep_size(32) = 128 GPUs => 16 nodes (8 H100/node).
  nodes: 16
  # Cold multi-node start + TileLang JIT runs past the 00:10:00 default.
  time: "00:30:00"