[CI] deepseek_v4_flash: run the full stack — triton indexer, compile, bs128

HAOCHENYE · HAOCHENYE · commit 39b83b563224 · 2026-06-27T05:43:18.000Z
Switch the V4-Flash reference config from the 4-layer smoke setup to the full
run: drop the ``num_hidden_layers = 4`` cap (use the release's 43 layers), select
the fused Triton indexer top-k backend (``indexer_backend = "triton"``), turn
``compile_cfg`` on, and raise ``global_batch_size`` 16 -> 128.
diff --git a/ci/config/deepseek_v4_flash.py b/ci/config/deepseek_v4_flash.py
@@ -35,7 +35,7 @@
 # fields (num_hash_layers, swiglu_limit, attn_sink dims) are picked up from the
 # checkpoint instead of relying on the Config defaults.
 moe_cfg = DeepSeekV4Config.from_hf(DEEPSEEK_V4_PATH)
-moe_cfg.num_hidden_layers = 4
+# moe_cfg.num_hidden_layers = 4
 # V4 MTP forward is not wired yet (DeepSeekV4.build_mtp_block returns None), but
 # from_hf sets mtp_config from the release's num_nextn_predict_layers=1. Left as-is,
 # MoE.build_loss_ctx_batch keys off `mtp_config is not None` and builds MTP loss
@@ -90,16 +90,17 @@
 # (slower; see DSA._resolve_sparse_attn_fn). Must match the DataloaderConfig
 # pack_max_length below.
 moe_cfg.attention.pack_max_length = pack_max_length
+moe_cfg.attention.indexer_backend = "triton"
 # Compile is now safe — cutlass group_gemm is annotated with @torch.library.custom_op
 # (compile-friendly), and HC + DSA helpers are pure-Tensor.
 # Temporarily disabled: under pack=8192 + intra_layer_micro_batch=1 +
 # recompute_ratio=1.0 some backward path allocates a 130 GiB fp32 tensor.
 # The 06:00 run with compile_cfg=False reached step 50 at max_mem 114 GB so
 # the baseline fits — debug what compile_cfg=True is changing in the eager
 # code path that adds 130 GB on top.
-moe_cfg.compile_cfg = False
+moe_cfg.compile_cfg = True
 
-optim_cfg = AdamWConfig(lr=6e-05)
+optim_cfg = AdamWConfig(lr=6e-05,)
 lr_cfg = LRConfig(lr_type="cosine", lr_min=1e-6)
 fsdp_cfg = FSDPConfig(
     # `FSDPConfig.torch_compile` is deprecated (1.1.0) and now acts as a master
@@ -150,7 +151,7 @@
     lr_cfg=lr_cfg,
     loss_cfg=loss_cfg,
     tokenizer_path=DEEPSEEK_V4_PATH,
-    global_batch_size=16,
+    global_batch_size=128,
     work_dir="/mnt/shared-storage-user/yehaochen/tmp",
     seed=0,
     strict_load=False,