chore: clarify padding-index clamp comment and clean up stale TODO

kaixih · claude · kaixih · commit b6c0d39c46e6 · 2026-04-17T17:59:56.000Z
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
diff --git a/python/sglang/srt/layers/attention/linear/kernels/gdn_flashinfer.py b/python/sglang/srt/layers/attention/linear/kernels/gdn_flashinfer.py
@@ -200,8 +200,9 @@ def extend(
         beta_fi = beta[0].to(torch.float32)
 
         if self.is_sm100plus:
-            # SM100+: slot 0 is reserved as dummy/scratch (never assigned to real
-            # sequences), so clamp(-1 → 0).
+            # Negative indices (e.g. -1) are padding markers for slots not yet
+            # assigned to a real sequence; clamp them to 0 (the reserved dummy
+            # slot) so the FlashInfer kernel never reads out-of-bounds state.
             ssm_cache_indices = cache_indices.clamp(min=0).to(torch.int64)
             num_seqs = ssm_cache_indices.shape[0]
             num_sab_heads = max(q.shape[2], num_v_heads)
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
@@ -2722,7 +2722,8 @@ def _handle_linear_attn_backend(self):
                 f"got {self.mamba_ssm_dtype!r}"
             )
 
-        # SM100+ FlashInfer GDN prefill requires CUDA 13+ (CuTe DSL kernel).
+        # SM100+ FlashInfer GDN prefill requires CUDA 13+ (CuTe DSL kernel)
+        # for correctness and best performance.
         prefill = self.linear_attn_prefill_backend or self.linear_attn_backend
         if (
             prefill == "flashinfer"
diff --git a/test/registered/4-gpu-models/test_qwen35_fp4_triton.py b/test/registered/4-gpu-models/test_qwen35_fp4_triton.py
@@ -48,12 +48,6 @@ def test_gsm8k(self):
                 extra_args=base_args,
                 variant="Triton",
             ),
-            # TODO: Fix this and re-enable it
-            # ModelLaunchSettings(
-            #     QWEN35_FP4_MODEL,
-            #     extra_args=base_args + ["--linear-attn-decode-backend", "flashinfer"],
-            #     variant="FlashInfer",
-            # ),
         ]
 
         run_combined_tests(

Original file line number	Diff line number	Diff line change
`@@ -2722,7 +2722,8 @@ def _handle_linear_attn_backend(self):`
`2722`	`2722`	`f"got {self.mamba_ssm_dtype!r}"`
`2723`	`2723`	`)`
`2724`	`2724`
`2725`		`- # SM100+ FlashInfer GDN prefill requires CUDA 13+ (CuTe DSL kernel).`
	`2725`	`+ # SM100+ FlashInfer GDN prefill requires CUDA 13+ (CuTe DSL kernel)`
	`2726`	`+ # for correctness and best performance.`
`2726`	`2727`	`prefill = self.linear_attn_prefill_backend or self.linear_attn_backend`
`2727`	`2728`	`if (`
`2728`	`2729`	`prefill == "flashinfer"`