File tree Expand file tree Collapse file tree
flashinfer/fused_moe/cute_dsl Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -485,10 +485,16 @@ def _forward_with_tactic(
485485 ** kwargs ,
486486 ) -> torch .Tensor :
487487 """Forward implementation called by auto-tuner."""
488- # Pre-allocated buffers are sized for self.tile_size. When the tactic
489- # uses a different tile_size (e.g. during autotune), fall back to
490- # dynamic allocation to avoid buffer overflow in moe_sort.
491- use_prealloc = self .use_cuda_graph and tile_size == self .tile_size
488+ # Pre-allocated buffers are sized for self.max_num_tokens and
489+ # self.tile_size. When the tactic uses a different tile_size or the
490+ # autotuner profiles a larger batch, fall back to dynamic allocation
491+ # to avoid buffer overflow.
492+ num_tokens = x .shape [0 ]
493+ use_prealloc = (
494+ self .use_cuda_graph
495+ and tile_size == self .tile_size
496+ and num_tokens <= self .max_num_tokens
497+ )
492498 return _moe_core_impl (
493499 x = x ,
494500 x_sf = x_sf ,
You can’t perform that action at this time.
0 commit comments