Fix zero token issue (#1895)

CyCle1024 · web-flow · commit b4d21bcdf092 · 2026-06-09T17:18:38.000+08:00
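Fix zero-token dispatch: handle seq == 0 inside fp8_gmm_weight_per_block_act_per_tile (forward returns an empty (seq, dout) tensor; backward returns an empty input grad and a zero weight grad), reshape with explicit sizes instead of -1 in the FP8 grouped linear forward, and remove the early `return x` shortcut for empty input from the forward in moe_decoder_layer.py.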
diff --git a/xtuner/v1/float8/float8_gmm_tile_wise.py b/xtuner/v1/float8/float8_gmm_tile_wise.py
@@ -88,6 +88,13 @@ class fp8_gmm_weight_per_block_act_per_tile(torch.autograd.Function):
     def forward(ctx, x, w_fp8, tokens_per_expert):
         seq, din = x.shape
         ne, dout, din = w_fp8.shape
+        ctx.zero_token_dispatch = seq == 0
+        ctx.input_shape = x.shape
+        ctx.weight_shape = w_fp8.shape
+
+        if ctx.zero_token_dispatch:
+            return x.new_empty((seq, dout))
+
         x_fp8, x_scale = per_tile_quant(x)
         (
             x_trans_quant_fp8,
@@ -104,6 +111,11 @@ def forward(ctx, x, w_fp8, tokens_per_expert):
 
     @staticmethod
     def backward(ctx, grad_output_hp):
+        if ctx.zero_token_dispatch:
+            dx = grad_output_hp.new_empty(ctx.input_shape)
+            dw = grad_output_hp.new_zeros(ctx.weight_shape)
+            return dx, dw, None
+
         (
             x_trans_quant_fp8,
             x_trans_quant_scale,
@@ -278,9 +290,10 @@ def forward(self, input: torch.Tensor, tokens_per_expert, decoding: bool = False
             weight_fp8 = weight_to_per_block_float8_dynamic.apply(weight, torch.float8_e4m3fn, 128)
 
         orig_shape = input.shape
-        input = input.view(-1, input.shape[-1])
+        num_tokens = input.numel() // input.shape[-1]
+        input = input.view(num_tokens, input.shape[-1])
         out = fp8_gmm_weight_per_block_act_per_tile.apply(input, weight_fp8, tokens_per_expert)
-        out = out.view(*orig_shape[:-1], -1)
+        out = out.view(*orig_shape[:-1], self.out_features)
         return out
 
     @property
diff --git a/xtuner/v1/module/decoder_layer/moe_decoder_layer.py b/xtuner/v1/module/decoder_layer/moe_decoder_layer.py
@@ -184,10 +184,6 @@ def __init__(
         self.moe_act = moe_act_fn_cfg.build()
 
     def forward(self, x, tokens_per_expert, decoding):
-        # short cut for dispatching 0 token in ep_size >1 case
-        if x.numel() == 0:
-            return x
-
         gate_up_out = self.fused_w1w3(x, tokens_per_expert, decoding)
         out = self.moe_act(gate_up_out, split_dim=-1)
         res = self.fused_w2(out, tokens_per_expert, decoding)