Commit c0b80b6

update: do not prealloc buffer when the input token count is larger than the max token count
1 parent a550764 commit c0b80b6

1 file changed

flashinfer/fused_moe/cute_dsl/fused_moe.py
Lines changed: 10 additions & 4 deletions
@@ -485,10 +485,16 @@ def _forward_with_tactic(
         **kwargs,
     ) -> torch.Tensor:
         """Forward implementation called by auto-tuner."""
-        # Pre-allocated buffers are sized for self.tile_size. When the tactic
-        # uses a different tile_size (e.g. during autotune), fall back to
-        # dynamic allocation to avoid buffer overflow in moe_sort.
-        use_prealloc = self.use_cuda_graph and tile_size == self.tile_size
+        # Pre-allocated buffers are sized for self.max_num_tokens and
+        # self.tile_size. When the tactic uses a different tile_size or the
+        # autotuner profiles a larger batch, fall back to dynamic allocation
+        # to avoid buffer overflow.
+        num_tokens = x.shape[0]
+        use_prealloc = (
+            self.use_cuda_graph
+            and tile_size == self.tile_size
+            and num_tokens <= self.max_num_tokens
+        )
         return _moe_core_impl(
             x=x,
             x_sf=x_sf,
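
To illustrate the gating logic this commit introduces, here is a minimal, self-contained sketch of the prealloc-versus-dynamic-allocation decision. The Workspace class, get_workspace method, and buffer shapes are hypothetical stand-ins and not flashinfer's actual API; only the use_prealloc condition mirrors the diff above.

import torch

class Workspace:
    """Hypothetical holder for a pre-allocated MoE scratch buffer.

    The buffer is sized once for (max_num_tokens, tile_size), the worst case
    that CUDA graph capture needs. Any request that does not fit those
    dimensions must not reuse it.
    """

    def __init__(self, max_num_tokens: int, tile_size: int,
                 use_cuda_graph: bool, device: str = "cpu"):
        self.max_num_tokens = max_num_tokens
        self.tile_size = tile_size
        self.use_cuda_graph = use_cuda_graph
        self.prealloc = (
            torch.empty(max_num_tokens, tile_size, device=device)
            if use_cuda_graph
            else None
        )

    def get_workspace(self, x: torch.Tensor, tile_size: int) -> torch.Tensor:
        num_tokens = x.shape[0]
        # Same condition as the diff: reuse the fixed buffer only when it was
        # sized for this tile_size AND can hold the current token count.
        use_prealloc = (
            self.use_cuda_graph
            and tile_size == self.tile_size
            and num_tokens <= self.max_num_tokens
        )
        if use_prealloc:
            return self.prealloc[:num_tokens]
        # Autotune may probe a different tile_size or a larger batch than the
        # buffer was sized for; allocate dynamically instead of overflowing it.
        return torch.empty(num_tokens, tile_size, device=x.device)


# Usage: a workspace sized for 8 tokens refuses to hand out its fixed buffer
# when fed 16 tokens, and falls back to a fresh allocation instead.
ws = Workspace(max_num_tokens=8, tile_size=128, use_cuda_graph=True)
small = ws.get_workspace(torch.zeros(4, 16), tile_size=128)   # view of prealloc
large = ws.get_workspace(torch.zeros(16, 16), tile_size=128)  # dynamic fallback
assert small.shape == (4, 128) and large.shape == (16, 128)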
