[Feature] V4 KVCompressor: compressed-kv RoPE + per-ratio cu_seq_lens_out reuse

HAOCHENYE · HAOCHENYE · commit 8a0efa82c831 · 2026-06-27T05:23:55.000Z
Two changes that co-evolve the KVCompressor.forward signature (compressed-rope
table + precomputed boundaries are added side by side), so they land together.

1. Compressed-kv RoPE. After the chunk softmax + norm, rotate each compressed
   chunk's rope tail at its window-center position, mirroring HF
   DeepseekV4{CSA,HCA}Compressor.forward. ``qk_rope_head_dim`` is wired from the
   DSA/Indexer configs into the internal KVCompressor, and DSA now forwards
   ``position_embeddings_compressed`` to the compressor (required for
   compress_ratio > 0, not just == 4). The chunk->sample map uses
   ``searchsorted(cu_seq_lens_out, ., right=True) - 1`` — right=True is
   load-bearing: a chunk on a sample boundary is the first chunk of the next
   sample, and mapping it to the previous one overruns ``first_token_per_chunk``
   and indexes the rope table out of bounds.

2. Hoist cu_seq_lens_out. ``KVCompressor.build_cu_seq_lens_out`` computes the
   per-sample compressed boundaries once; DeepSeekV4 forward builds one per
   distinct compress_ratio and caches it on
   ``SequenceContext.compressed_cu_seq_lens``, so every decoder layer of that
   ratio reuses a single cumsum + H2D instead of recomputing it. ``total_c``
   stays derived in the compressor from the CPU mirror (it must remain a Python
   int and would force a recompile if threaded through the compiled attn graph).
   Standalone callers (no cache on seq_ctx) fall back to building it in-place.
diff --git a/xtuner/v1/data_proto/sequence_context.py b/xtuner/v1/data_proto/sequence_context.py
@@ -35,6 +35,13 @@ class SequenceContext:
     # consumers must tolerate fallback to a GPU ``.item()`` path in that case.
     cu_seq_lens_q_cpu: torch.Tensor | None
     cu_seq_lens_k_cpu: torch.Tensor | None
+    # Optional per-``compress_ratio`` cache of compressed-chunk cumulative boundaries
+    # (``{ratio: cu_seq_lens_out}``), populated by chunk-compression models (DeepSeek-V4)
+    # at the start of forward via ``KVCompressor.build_cu_seq_lens_out`` so every decoder
+    # layer of a given ratio reuses one cumsum + H2D instead of recomputing it. ``None`` for
+    # models that don't compress; not a constructor argument (set post-construction, like
+    # ``seq_idx``) so it stays out of the generic SequenceContext contract.
+    compressed_cu_seq_lens: dict[int, torch.Tensor] | None
     max_length_q: torch.Tensor
     max_length_k: torch.Tensor
     num_padding: int
@@ -130,6 +137,9 @@ def __init__(
         self._shard_start = shard_start
         self._shard_size = shard_size
         self.seq_idx = None
+        # Populated lazily by the model forward (chunk-compression models only); see the
+        # field declaration above.
+        self.compressed_cu_seq_lens = None
 
         # `DeviceMesh.get_local_rank` is not compatible with `torch.compile`, we calculate `_sp_rank` in
         # `SequenceContext`
diff --git a/xtuner/v1/model/moe/deepseek_v4.py b/xtuner/v1/model/moe/deepseek_v4.py
@@ -31,6 +31,7 @@
 from transformers import AutoConfig
 from xtuner.v1.module import HashRouterConfig, NoAuxRouterConfig
 from xtuner.v1.module.attention.dsa import DSAConfig
+from xtuner.v1.module.attention.kv_compressor import KVCompressor
 from xtuner.v1.module.decoder_layer.deepseek_v4_decoder_layer import V4DecoderLayer
 from xtuner.v1.module.decoder_layer.hc_block import HCWrapperConfig
 from xtuner.v1.module.decoder_layer.moe_decoder_layer import (
@@ -506,6 +507,10 @@ def build_layers(self, config: MoEConfig) -> nn.ModuleDict:
             f"compress_ratios (len={len(compress_ratios) if compress_ratios else 0}) must cover "
             f"all {v4_cfg.num_hidden_layers} hidden layers"
         )
+        # Distinct positive compress_ratios across the stack. The model forward builds one
+        # ``cu_seq_lens_out`` per ratio and caches it on the SequenceContext, so every layer of
+        # that ratio reuses the cumsum + H2D instead of recomputing it inside its KVCompressor.
+        self._compressor_ratios = sorted({r for r in compress_ratios[: v4_cfg.num_hidden_layers] if r > 0})
 
         layers = nn.ModuleDict()
         for layer_idx in range(v4_cfg.num_hidden_layers):
@@ -604,10 +609,23 @@ def _should_compute_aux_loss(self, layer_idx: int) -> bool:
     # automatically — V4's mtp_block is None (build_mtp_block returns None) — so the
     # parent's MTP branch is a no-op (PR9 follow-up wires the V4-specific MTP head).
 
+    def _assign_compressed_cu_seq_lens(self, seq_ctx) -> None:
+        # Build ``cu_seq_lens_out`` once per distinct compress_ratio and cache it on the
+        # SequenceContext, so the per-layer KVCompressor (DSA + Indexer) reuses it instead of
+        # re-running the cumsum + H2D every call. Keyed by ratio because the chunk count is
+        # ``ceil(L_i / ratio)`` — different for the ratio-4 and ratio-128 layers.
+        if not self._compressor_ratios:
+            return
+        seq_ctx.compressed_cu_seq_lens = {
+            ratio: KVCompressor.build_cu_seq_lens_out(seq_ctx.cu_seq_lens_q, seq_ctx.cu_seq_lens_q_cpu, ratio)[0]
+            for ratio in self._compressor_ratios
+        }
+
     @override
     def _prepare_hidden_states(self, seq_ctx) -> tuple[torch.Tensor, dict]:  # type: ignore[override]
         assert seq_ctx.position_ids is not None
         assert seq_ctx.input_ids is not None, "DeepSeekV4 requires input_ids (HashRouter consumes them)"
+        self._assign_compressed_cu_seq_lens(seq_ctx)
         hidden_states = self.embed_tokens(seq_ctx.input_ids)
         # Dense rope (sliding-window heads) and compressed rope (Indexer) both come
         # from the same DualRotaryEmbedding; precompute both so each layer picks the
@@ -676,6 +694,7 @@ def _prepare_hidden_states_mb(self, seq_ctx_list) -> tuple[list[torch.Tensor], d
         for seq_ctx in seq_ctx_list:
             assert seq_ctx.position_ids is not None
             assert seq_ctx.input_ids is not None, "DeepSeekV4 requires input_ids (HashRouter consumes them)"
+            self._assign_compressed_cu_seq_lens(seq_ctx)
             h = self.embed_tokens(seq_ctx.input_ids)
             pos_emb = self.rotary_emb(h, seq_ctx.position_ids, use_compressed=False)
             pos_emb_compressed = _build_compressed_position_embeddings(self.rotary_emb, h, seq_ctx.position_ids)
diff --git a/xtuner/v1/module/attention/dsa.py b/xtuner/v1/module/attention/dsa.py
@@ -277,6 +277,7 @@ def __init__(
                 compress_ratio=compress_ratio,
                 overlap=(compress_ratio == 4),
                 rotate=False,
+                qk_rope_head_dim=dsa_cfg.qk_rope_head_dim,
                 rms_norm_eps=dsa_cfg.rms_norm_eps,
             )
         else:
@@ -461,8 +462,13 @@ def forward(
             )
         if hidden_states.size(-1) != self.hidden_size:
             raise ValueError(f"hidden_states last dim {hidden_states.size(-1)} != hidden_size {self.hidden_size}")
-        if self.compress_ratio == 4 and position_embeddings_compressed is None:
-            raise ValueError("position_embeddings_compressed is required for compress_ratio == 4 (Indexer rope)")
+        if self.compress_ratio > 0 and position_embeddings_compressed is None:
+            raise ValueError(
+                "position_embeddings_compressed is required for compress_ratio > 0 "
+                "(the KVCompressor rotates compressed-kv with the compressed-rope basis to "
+                "match V4 reference Compressor.forward, mirroring HF "
+                "DeepseekV4{CSA,HCA}Compressor)"
+            )
 
         cos, sin = position_embeddings
         total_tokens = hidden_states.size(1)
@@ -534,10 +540,22 @@ def forward(
             # so, the single outer call eliminates N entry/exit pairs and
             # batches the GEMMs at the layer boundary.
             assert self.compressor is not None  # compress_ratio > 0 always materialises it
+            # Compressed boundaries are built once per compress_ratio at model forward and cached
+            # on seq_ctx (DeepSeekV4._assign_compressed_cu_seq_lens); reuse this layer's instead of
+            # recomputing the cumsum + H2D. ``None`` (e.g. standalone DSA in a unit test) falls back
+            # to the compressor building it. The Indexer's internal compressor shares this ratio's
+            # value. ``.get`` over the constant ``self.compress_ratio`` key stays compile-traceable.
+            cu_seq_lens_out = (
+                seq_ctx.compressed_cu_seq_lens.get(self.compress_ratio)
+                if seq_ctx.compressed_cu_seq_lens is not None
+                else None
+            )
             kv_compressed, cu_c = self.compressor(
                 hidden_states,
                 cu_q,
                 cu_seq_lens_cpu=seq_ctx.cu_seq_lens_q_cpu,
+                position_embeddings_compressed=position_embeddings_compressed,
+                cu_seq_lens_out=cu_seq_lens_out,
             )  # [1, total_c, D], [B+1]
             if self.compress_ratio == 4:
                 # ``DualRotaryEmbedding`` already emits half-dim cos/sin in the
@@ -564,6 +582,7 @@ def forward(
                         (cos_c, sin_c),
                         cu_q,
                         cu_seq_lens_cpu=seq_ctx.cu_seq_lens_q_cpu,
+                        cu_seq_lens_out=cu_seq_lens_out,
                     )
             else:
                 # compress_ratio == 128: deterministic positional top-k.
diff --git a/xtuner/v1/module/attention/indexer.py b/xtuner/v1/module/attention/indexer.py
@@ -181,6 +181,12 @@ def __init__(self, config: IndexerConfig) -> None:
             compress_ratio=config.compress_ratio,
             overlap=True,
             rotate=True,
+            # Wire the same ``rope_head_dim`` the Indexer uses for its q rope
+            # tail through to the internal KVCompressor: HF
+            # ``DeepseekV4Indexer.forward`` rotates its ``compressed`` output at
+            # window-center positions before scoring (modeling_deepseek_v4.py L541),
+            # so the internal compressor must do the same.
+            qk_rope_head_dim=config.rope_head_dim,
             rms_norm_eps=config.rms_norm_eps,
         )
 
@@ -191,6 +197,7 @@ def forward(
         position_embeddings_compressed: tuple[torch.Tensor, torch.Tensor],
         cu_seq_lens: torch.Tensor,
         cu_seq_lens_cpu: torch.Tensor | None = None,
+        cu_seq_lens_out: torch.Tensor | None = None,
     ) -> torch.Tensor:
         """Compute per-query top-k compressed-KV indices.
 
@@ -206,6 +213,12 @@ def forward(
                 DSA layer's dual-rope module.
             cu_seq_lens (torch.Tensor): 1D int32 cumulative per-sample token
                 counts with length ``num_samples + 1``.
+            cu_seq_lens_cpu (torch.Tensor | None): Optional CPU mirror of
+                ``cu_seq_lens`` forwarded to the internal compressor.
+            cu_seq_lens_out (torch.Tensor | None): Optional precomputed compressed
+                boundaries for this ``compress_ratio`` (built once at model
+                forward); forwarded to the internal compressor so it skips the
+                per-call cumsum + H2D. See :meth:`KVCompressor.build_cu_seq_lens_out`.
 
         Returns:
             torch.Tensor: Top-k indices shaped ``[1, total_tokens, index_topk]``
@@ -259,6 +272,8 @@ def forward(
             hidden_states,
             cu_seq_lens,
             cu_seq_lens_cpu=cu_seq_lens_cpu,
+            position_embeddings_compressed=position_embeddings_compressed,
+            cu_seq_lens_out=cu_seq_lens_out,
         )
 
         # Step 5: gate weights, scaled exactly as V4 reference L418.
diff --git a/xtuner/v1/module/attention/kv_compressor.py b/xtuner/v1/module/attention/kv_compressor.py