Skip to content

Commit b6c0d39

Browse files
kaixih and claude committed
chore: clarify padding-index clamp comment and clean up stale TODO
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent e843e17 commit b6c0d39

3 files changed

Lines changed: 5 additions & 9 deletions

File tree

python/sglang/srt/layers/attention/linear/kernels/gdn_flashinfer.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -200,8 +200,9 @@ def extend(
200200
beta_fi = beta[0].to(torch.float32)
201201

202202
if self.is_sm100plus:
203-
# SM100+: slot 0 is reserved as dummy/scratch (never assigned to real
204-
# sequences), so clamp(-1 → 0).
203+
# Negative indices (e.g. -1) are padding markers for slots not yet
204+
# assigned to a real sequence; clamp them to 0 (the reserved dummy
205+
# slot) so the FlashInfer kernel never reads out-of-bounds state.
205206
ssm_cache_indices = cache_indices.clamp(min=0).to(torch.int64)
206207
num_seqs = ssm_cache_indices.shape[0]
207208
num_sab_heads = max(q.shape[2], num_v_heads)

python/sglang/srt/server_args.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2722,7 +2722,8 @@ def _handle_linear_attn_backend(self):
27222722
f"got {self.mamba_ssm_dtype!r}"
27232723
)
27242724

2725-
# SM100+ FlashInfer GDN prefill requires CUDA 13+ (CuTe DSL kernel).
2725+
# SM100+ FlashInfer GDN prefill requires CUDA 13+ (CuTe DSL kernel)
2726+
# for correctness and best performance.
27262727
prefill = self.linear_attn_prefill_backend or self.linear_attn_backend
27272728
if (
27282729
prefill == "flashinfer"

test/registered/4-gpu-models/test_qwen35_fp4_triton.py

Lines changed: 0 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -48,12 +48,6 @@ def test_gsm8k(self):
4848
extra_args=base_args,
4949
variant="Triton",
5050
),
51-
# TODO: Fix this and re-enable it
52-
# ModelLaunchSettings(
53-
# QWEN35_FP4_MODEL,
54-
# extra_args=base_args + ["--linear-attn-decode-backend", "flashinfer"],
55-
# variant="FlashInfer",
56-
# ),
5751
]
5852

5953
run_combined_tests(

0 commit comments

Comments
 (0)