|
35 | 35 | # fields (num_hash_layers, swiglu_limit, attn_sink dims) are picked up from the |
36 | 36 | # checkpoint instead of relying on the Config defaults. |
37 | 37 | moe_cfg = DeepSeekV4Config.from_hf(DEEPSEEK_V4_PATH) |
38 | | -moe_cfg.num_hidden_layers = 4 |
| 38 | +# moe_cfg.num_hidden_layers = 4 |
39 | 39 | # V4 MTP forward is not wired yet (DeepSeekV4.build_mtp_block returns None), but |
40 | 40 | # from_hf sets mtp_config from the release's num_nextn_predict_layers=1. Left as-is, |
41 | 41 | # MoE.build_loss_ctx_batch keys off `mtp_config is not None` and builds MTP loss |
|
90 | 90 | # (slower; see DSA._resolve_sparse_attn_fn). Must match the DataloaderConfig |
91 | 91 | # pack_max_length below. |
92 | 92 | moe_cfg.attention.pack_max_length = pack_max_length |
| 93 | +moe_cfg.attention.indexer_backend = "triton" |
93 | 94 | # Compile is now safe — cutlass group_gemm is annotated with @torch.library.custom_op |
94 | 95 | # (compile-friendly), and HC + DSA helpers are pure-Tensor. |
95 | 96 | # NOTE(review): compile was previously disabled because, under pack=8192 +
96 | 97 | # intra_layer_micro_batch=1 + recompute_ratio=1.0, some backward path allocated
97 | 98 | # a 130 GiB fp32 tensor. The 06:00 run with compile_cfg=False reached step 50 at
98 | 99 | # max_mem 114 GB, so the baseline fits. It is re-enabled below — confirm that
99 | 100 | # whatever compile_cfg=True changed to add 130 GB on top has been resolved.
100 | | -moe_cfg.compile_cfg = False |
| 101 | +moe_cfg.compile_cfg = True |
101 | 102 |
|
102 | | -optim_cfg = AdamWConfig(lr=6e-05) |
| 103 | +optim_cfg = AdamWConfig(lr=6e-05,) |
103 | 104 | lr_cfg = LRConfig(lr_type="cosine", lr_min=1e-6) |
104 | 105 | fsdp_cfg = FSDPConfig( |
105 | 106 | # `FSDPConfig.torch_compile` is deprecated (1.1.0) and now acts as a master |
|
150 | 151 | lr_cfg=lr_cfg, |
151 | 152 | loss_cfg=loss_cfg, |
152 | 153 | tokenizer_path=DEEPSEEK_V4_PATH, |
153 | | - global_batch_size=16, |
| 154 | + global_batch_size=128, |
154 | 155 | work_dir="/mnt/shared-storage-user/yehaochen/tmp", |
155 | 156 | seed=0, |
156 | 157 | strict_load=False, |
|
0 commit comments