fix: fix illegal CUDA memory access in fused_moe_kernel (sgl-project#4727)

saltyfish66 · yuethe · thyecust · commit 2f0bc461d130 · 2025-04-11T17:06:07.000+08:00
Co-authored-by: yuethe <yuethe@tencent.com>
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
@@ -152,6 +152,7 @@ def fused_moe_kernel(
         return
     offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
     offs_token = tl.load(sorted_token_ids_ptr + offs_token_id)
+    offs_token = offs_token.to(tl.int64)
     token_mask = offs_token < num_valid_tokens
 
     offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N