Commit 9f7adfb
[Fmha] Add head_dim=512 support for trtllm attention kernels (#2959)
Add support for `head_dim=512` in the trtllm FMHA kernel selection.

### Changes

- Add an SDPA-based reference implementation for `head_dim > 256` in tests (FlashInfer FA2/FA3 kernels don't support `head_dim > 256`)
- Add `test_trtllm_batch_prefill_head_dim_512` and `test_trtllm_batch_decode_head_dim_512` covering BF16, FP16, and FP8 dtypes

### Follow-up

- NVFP4 coverage at `head_dim=512` is deferred to a follow-up PR.

Signed-off-by: Duncan Moss <djm.moss@gmail.com>
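For context on the test changes, here is a minimal sketch (not the PR's actual test code) of what an SDPA-based reference for `head_dim > 256` can look like, assuming PyTorch's `torch.nn.functional.scaled_dot_product_attention`; the `sdpa_reference` helper, shapes, and dtypes are illustrative:

```python
import torch
import torch.nn.functional as F

def sdpa_reference(q, k, v, causal=True):
    # q/k/v: [batch, num_heads, seq_len, head_dim]; head_dim may exceed 256,
    # which the FA2/FA3 reference kernels cannot handle.
    # Compute in float32 for a stable reference, then cast back to the input dtype.
    out = F.scaled_dot_product_attention(
        q.float(), k.float(), v.float(), is_causal=causal
    )
    return out.to(q.dtype)

# Reference output at head_dim=512, e.g. for comparison against the trtllm kernels.
q = torch.randn(1, 8, 128, 512, dtype=torch.bfloat16)
k = torch.randn(1, 8, 128, 512, dtype=torch.bfloat16)
v = torch.randn(1, 8, 128, 512, dtype=torch.bfloat16)
ref = sdpa_reference(q, k, v)
```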
1 parent 6ddbdb0 commit 9f7adfb

5 files changed

Lines changed: 295 additions & 12 deletions


csrc/fmhaReduction.cu

Lines changed: 2 additions & 2 deletions
```diff
@@ -81,7 +81,7 @@ __global__ void __launch_bounds__(NumThreadsPerCta, 2)
   seqLenKv = seqLenKv - ((params.mMaxSeqLenQ - 1) - ctaIdxQ);
   // Consider sparseMlaTopK.
   if (sparseMla) {
-    seqLenKv = min(seqLenKv, params.mSparseMlaTopK);
+    seqLenKv = min(seqLenKv, params.mSparseAttnTopK);
   }
   // The actual number of CtasKv (TileSizeKv is always 128 for now).
   int32_t numCtasKv{min((seqLenKv + 127) / 128, params.mMaxNumCtasKv)};
@@ -361,7 +361,7 @@ void runFmhaReduction(TllmGenFmhaKernelMetaInfo const& kernelMeta, KernelParams
   }

   // Launch the kernel.
-  cudaLaunchKernelEx(&config, kernel, params, kernelMeta.mSparseMla, numCtasForReduction,
+  cudaLaunchKernelEx(&config, kernel, params, kernelMeta.mSparseAttn != 0, numCtasForReduction,
                      numCtasForAllHeads, numHeadDimCtasV);
   cudaError_t err = cudaGetLastError();
   FLASHINFER_CHECK(err == cudaSuccess, "Failed to launch kernel: ", cudaGetErrorString(err));
```
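A Python sketch of the launch arithmetic above, assuming (per the diff's comment) that TileSizeKv is fixed at 128; the names mirror the kernel parameters, but `num_ctas_kv` itself is illustrative, not the CUDA code:

```python
def num_ctas_kv(seq_len_kv: int, max_num_ctas_kv: int,
                sparse_attn: bool, sparse_attn_top_k: int) -> int:
    if sparse_attn:
        # With sparse attention only the top-K KV tokens participate, so the
        # effective KV length is clamped to the top-K value.
        seq_len_kv = min(seq_len_kv, sparse_attn_top_k)
    # Ceiling division by the KV tile size (128), capped at the launch limit.
    return min((seq_len_kv + 127) // 128, max_num_ctas_kv)

# Sparse top-K=256 needs only ceil(256/128) = 2 KV CTAs even for a long sequence.
assert num_ctas_kv(4096, 16, sparse_attn=True, sparse_attn_top_k=256) == 2
# Dense attention over the same sequence hits the max-CTA cap instead.
assert num_ctas_kv(4096, 16, sparse_attn=False, sparse_attn_top_k=256) == 16
```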

flashinfer/artifacts.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -135,7 +135,7 @@ class ArtifactPath:
     When compiling new cubins for backend directories, update the corresponding path.
     """

-    TRTLLM_GEN_FMHA: str = "55bba55929d4093682e32d817bd11ffb0441c749/fmha/trtllm-gen/"
+    TRTLLM_GEN_FMHA: str = "82f4c77d9cf83e3fcf105feda4ce3445100ab491/fmha/trtllm-gen/"
     TRTLLM_GEN_BMM: str = (
         "39a9d28268f43475a757d5700af135e1e58c9849/batched_gemm-5ee61af-2b9855b/"
     )
@@ -155,7 +155,7 @@ class CheckSumHash:
     """

     TRTLLM_GEN_FMHA: str = (
-        "f2c0aad1e74391c4267a2f9a20ec819358b59e04588385cffb452ed341500b99"
+        "56c95fbe5d1b5d0d9ded7706e1c0b7ebf0582d9cfd2f9382acd878b6b9d58c89"
     )
     TRTLLM_GEN_BMM: str = (
         "db06db7f36a2a9395a2041ff6ac016fe664874074413a2ed90797f91ef17e0f6"
```

include/flashinfer/trtllm/fmha/fmhaKernels.cuh

Lines changed: 9 additions & 1 deletion
```diff
@@ -191,7 +191,7 @@ class TllmGenFmhaKernel {
         kernelMeta.mTileScheduler, kernelMeta.mMultiCtasKvMode,
         kernelMeta.mHeadDimPerCtaV, kernelMeta.mHeadDimQk, kernelMeta.mHeadDimV,
         kernelMeta.mTileSizeQ, kernelMeta.mTileSizeKv, kernelMeta.mNumTokensPerPage,
-        kernelMeta.mReuseSmemKForV, kernelMeta.m2CtaMma, kernelMeta.mSparseMla,
+        kernelMeta.mReuseSmemKForV, kernelMeta.m2CtaMma, kernelMeta.mSparseAttn != 0,
         kernelMeta.mSkipsSoftmaxWhenPossible);
   }

@@ -789,6 +789,14 @@ class TllmGenFmhaKernel {
       selectGqGenerationKernel(params, selectKernelParams);
     }

+    // For headDimV > 256, set headDimPerCtaV to 256 for context and keepsMmaAbForGeneration
+    // kernels. swapsMmaAbForGeneration has enough TMEM resources to hold the full headDimV.
+    // Called for context and GQA generation; MLA sets headDimPerCtaV separately.
+    if (params.mHeadDimV > 256 && !isMlaGenKernel(params) &&
+        !isSwapsMmaAbForGenerationKernel(selectKernelParams.mKernelType)) {
+      selectKernelParams.mHeadDimPerCtaV = 256;
+    }
+
     // Enable sliding window or chunked causal if the max kv sequence length exceeds attention
     // window size or chunked attention size. This is supported by causal-mask context kernels and
     // generation-phase kernels.
```
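A small sketch of the head-dimension split the new branch implies: kernels that cap `headDimPerCtaV` at 256 must cover `headDimV = 512` in two per-CTA slices (the `numHeadDimCtasV` passed to the reduction launch above), while kernels that hold the full head dim use one. `num_head_dim_ctas_v` is an illustrative name, not the C++ helper:

```python
def num_head_dim_ctas_v(head_dim_v: int, head_dim_per_cta_v: int) -> int:
    # Ceiling division: how many CTAs are needed to cover the V head dim.
    return (head_dim_v + head_dim_per_cta_v - 1) // head_dim_per_cta_v

# head_dim=512 with the clamp to 256 means two CTA slices along head dim V.
assert num_head_dim_ctas_v(512, 256) == 2
# Kernels with enough TMEM for the full head dim (e.g. swapsMmaAbForGeneration
# per the comment above) need only a single slice.
assert num_head_dim_ctas_v(512, 512) == 1
```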

include/flashinfer/trtllm/fmha/kernelParams.h

Lines changed: 4 additions & 3 deletions
```diff
@@ -169,13 +169,14 @@ struct KernelParams {
   float mScaleSfO;
   // Threshold to decide whether warp skips softmax ops
   float mSkipSoftmaxThresholdScaleFactor;
+  // The sparse attention topK value. Must immediately follow mSkipSoftmaxThresholdScaleFactor
+  // to match the GPU struct layout expected by trtllm-gen kernels (changed in a339772b).
+  int32_t mSparseAttnTopK;
   // The start token index in SF tensor. Used for FP4 SF offset calculation in generation phase
   // kernel when inflight batching is enabled in TRT-LLM.
   int32_t mStartTokenIdxSfO;
   // The sum of sequence lengths for Q and K/V.
   int32_t mSumOfSeqLensQ, mSumOfSeqLensKv;
-  // The sparseMla topK value.
-  int32_t mSparseMlaTopK;
   // The flag to use block sparse attention.
   bool mUseBlockSparseAttention;
   // Whether the indices for K & V pages are shared as unified index.
@@ -854,7 +855,7 @@ struct KernelParams {
     // indices.
     FLASHINFER_CHECK(!options.mSparseMla || (options.mSparseMlaTopK % 4) == 0,
                      "SparseMlaTopK must be a multiple of 4");
-    params.mSparseMlaTopK = options.mSparseMlaTopK;
+    params.mSparseAttnTopK = options.mSparseMlaTopK;
     // TODO: Integrate trtllm block-sparse attention kernels when needed.
     params.mUseBlockSparseAttention = false;
     // Whether the indices for K & V pages are shared as unified index (vLLM/FlashInfer).
```
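The layout comment in the struct is the crux of the field move: the host-side `KernelParams` must agree byte-for-byte with the layout baked into the trtllm-gen cubins. A toy `ctypes` sketch (fields trimmed to the ones involved; not the real struct) of why `mSparseAttnTopK` has to sit immediately after the threshold float:

```python
import ctypes

class KernelParamsTail(ctypes.Structure):
    # Mirrors the relevant slice of the C++ struct after this change.
    _fields_ = [
        ("mSkipSoftmaxThresholdScaleFactor", ctypes.c_float),
        ("mSparseAttnTopK", ctypes.c_int32),  # moved up next to the float
        ("mStartTokenIdxSfO", ctypes.c_int32),
        ("mSumOfSeqLensQ", ctypes.c_int32),
        ("mSumOfSeqLensKv", ctypes.c_int32),
    ]

# Reordering fields shifts every later offset, so the GPU would read garbage
# if host and device disagreed on where mSparseAttnTopK lives.
assert KernelParamsTail.mSparseAttnTopK.offset == 4
```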
