Default to FlashInfer GDN decode on SM100+ with bf16 mamba state

YAMY1234 · YAMY1234 · commit c92b80d4ddc2 · 2026-04-01T10:12:29.000-07:00
On SM100+ with mamba-ssm-dtype=bfloat16, automatically set --linear-attn-decode-backend to flashinfer when not explicitly specified. This gives 1-5% TPOT improvement at higher concurrencies. The prerequisite bug (OOB from negative padding indices in bf16 decode kernel) was fixed in FlashInfer v0.6.7 via flashinfer-ai/flashinfer#2810. The default is excluded when MTP speculative decoding is active (not yet supported). Verified on Qwen3.5-397B-A17B-NVFP4 (4xGB200, no_buffer + disable-radix-cache), sa-bench ISL=1024 OSL=1024, conc 2-1024: - GSM8K accuracy: 0.977-0.979 - Mean TPOT: -1.3% (conc=2) to -4.5% (conc=1024) - Output throughput: +1.3% (conc=2) to +4.7% (conc=1024)
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
@@ -2557,9 +2557,27 @@ def _handle_mamba_backend(self):
                 )
 
     def _handle_linear_attn_backend(self):
-        # SM100+ FlashInfer GDN decode requires bf16 state; SM90 uses float32.
         import torch
 
+        # SM100+: default to FlashInfer GDN decode when the user hasn't
+        # explicitly chosen a decode backend and mamba-ssm-dtype is bf16
+        # (required by FlashInfer GDN on SM100+).
+        # Fixed in FlashInfer v0.6.7: flashinfer-ai/flashinfer#2810
+        # Excluded when MTP speculative decoding is enabled because
+        # FlashInfer GDN MTP verify is not yet supported on SM100+.
+        if (
+            self.linear_attn_decode_backend is None
+            and is_sm100_supported()
+            and self.mamba_ssm_dtype == "bfloat16"
+            and self.speculative_algorithm is None
+        ):
+            self.linear_attn_decode_backend = "flashinfer"
+            logger.info(
+                "SM100+ detected with mamba-ssm-dtype=bfloat16, "
+                "defaulting --linear-attn-decode-backend to flashinfer."
+            )
+
+        # SM100+ FlashInfer GDN decode requires bf16 state; SM90 uses float32.
         decode = self.linear_attn_decode_backend or self.linear_attn_backend
         if (
             decode == "flashinfer"