commit 1f42205 (1 parent: 0632bb8)
launcher_scripts/conf/training/mixtral/mixtral_8x7b.yaml
@@ -52,9 +52,9 @@ model:
   micro_batch_size: 1
   global_batch_size: 256
   rampup_batch_size: null
-  tensor_model_parallel_size: 2
-  pipeline_model_parallel_size: 1
-  expert_model_parallel_size: 8
+  tensor_model_parallel_size: 8
+  pipeline_model_parallel_size: 4
+  expert_model_parallel_size: 1
   virtual_pipeline_model_parallel_size: null
   encoder_seq_length: 4096
   max_position_embeddings: 32768
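
For orientation, here is how the parallelism block reads after this hunk, with the standard Megatron-style arithmetic spelled out in comments. This is a hedged reading of the diff, not text from the commit itself: the GPU counts are derived, and the note on expert placement assumes Megatron-core MoE semantics.

# model.* parallelism settings after this commit
micro_batch_size: 1
global_batch_size: 256
rampup_batch_size: null
# One model replica now spans tensor * pipeline = 8 * 4 = 32 GPUs
# (previously 2 * 1 = 2, with the 8 experts spread over 8 expert-parallel ranks).
tensor_model_parallel_size: 8
pipeline_model_parallel_size: 4
# Expert parallelism is switched off: with size 1, the eight Mixtral experts
# are no longer distributed across dedicated ranks; their weights are split
# only by tensor parallelism within each replica.
expert_model_parallel_size: 1
# With micro_batch_size 1, the global batch of 256 is reached through
# data-parallel size times gradient-accumulation steps.
virtual_pipeline_model_parallel_size: null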
@@ -145,9 +145,7 @@ model:
     - 0
     gen_shape: false
   optim:
-    name: mcore_distributed_optim
-    overlap_grad_sync: true
-    overlap_param_sync: true
+    name: distributed_fused_adam
     lr: 0.0001
     weight_decay: 0.1
     betas:
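
The second hunk swaps the optimizer. A hedged reading of the change: mcore_distributed_optim is Megatron-core's distributed optimizer, and overlap_grad_sync / overlap_param_sync are its communication-overlap flags, so they are dropped together with it; distributed_fused_adam is NeMo's Apex-based fused Adam, which shards optimizer state across data-parallel ranks on its own. The resulting block:

optim:
  name: distributed_fused_adam  # Apex-based distributed Adam; optimizer state sharded across data-parallel ranks
  lr: 0.0001
  weight_decay: 0.1
  betas:
    # (values unchanged by this commit; they lie outside the hunk)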