Skip to content

Commit 3fb9c71

Browse files
authored
update moe_smooth_per_token_scaled_quant dispatch; v2 now supports block_m being any multiple of 16 (#2333)
1 parent cb0b0c8 commit 3fb9c71

File tree

3 files changed

+57
-71
lines changed

3 files changed

+57
-71
lines changed

aiter/fused_moe_bf16_asm.py

Lines changed: 0 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -198,35 +198,6 @@ def asm_moe(
198198
# aiter.moe_smoothquant_fwd(
199199
# a8, hidden_states, fc1_smooth_scale, topk_ids, a8_scale
200200
# )
201-
# aiter.smooth_per_token_scaled_quant(
202-
# a8.view(topk, M, model_dim).transpose(0, 1),
203-
# hidden_states.view(M, 1, model_dim).expand(-1, topk, -1),
204-
# a8_scale,
205-
# fc1_smooth_scale,
206-
# topk_ids,
207-
# smooth_scale_map_hash=local_expert_hash,
208-
# enable_ps=True,
209-
# )
210-
# aiter.moe_smooth_per_token_scaled_quant_v1(
211-
# a8,
212-
# hidden_states,
213-
# a8_scale,
214-
# fc1_smooth_scale,
215-
# topk_ids,
216-
# smooth_scale_map_hash=local_expert_hash,
217-
# transpose_out=True,
218-
# )
219-
# aiter.moe_smooth_per_token_scaled_quant_v2(
220-
# a8,
221-
# hidden_states,
222-
# a8_scale,
223-
# fc1_smooth_scale,
224-
# sorted_ids,
225-
# sorted_expert_ids,
226-
# num_valid_ids,
227-
# BLOCK_SIZE_M,
228-
# transpose_out=True,
229-
# )
230201
aiter.moe_smooth_per_token_scaled_quant(
231202
a8,
232203
hidden_states,

aiter/ops/quant.py

Lines changed: 26 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -415,21 +415,35 @@ def moe_smooth_per_token_scaled_quant(
415415
local_expert_hash: Optional[torch.Tensor] = None,
416416
shuffle_scale: bool = False,
417417
transpose_out: bool = False,
418+
is_balanced: bool = False,
418419
) -> None:
419420
cu_num = get_cu_num()
420421
is_moe_stage1 = input.numel() != out.numel()
421-
token_num = input.shape[0]
422-
if is_moe_stage1 and local_expert_hash is not None and token_num < cu_num * 8:
423-
moe_smooth_per_token_scaled_quant_v1(
424-
out,
425-
input,
426-
scales,
427-
smooth_scale,
428-
topk_ids,
429-
shuffle_scale,
430-
local_expert_hash,
431-
transpose_out,
432-
)
422+
M = input.shape[0]
423+
if is_moe_stage1 and local_expert_hash is not None and M < cu_num * 8:
424+
if is_balanced:
425+
moe_smooth_per_token_scaled_quant_v1(
426+
out,
427+
input,
428+
scales,
429+
smooth_scale,
430+
topk_ids,
431+
shuffle_scale,
432+
local_expert_hash,
433+
transpose_out,
434+
)
435+
else:
436+
topk = topk_ids.shape[1]
437+
model_dim = input.shape[-1]
438+
smooth_per_token_scaled_quant(
439+
out.view(topk, M, model_dim).transpose(0, 1),
440+
input.view(M, 1, model_dim).expand(-1, topk, -1),
441+
scales,
442+
smooth_scale,
443+
topk_ids,
444+
smooth_scale_map_hash=local_expert_hash,
445+
enable_ps=True,
446+
)
433447
else:
434448
moe_smooth_per_token_scaled_quant_v2(
435449
out,

csrc/kernels/quant_kernels.cu

Lines changed: 31 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1515,33 +1515,33 @@ __global__ void moe_smooth_per_token_scaled_quant_kernel_v2(DTYPE_O* __restrict_
15151515
}
15161516

15171517

1518-
#define MOE_SMOOTH_PER_TOKEN_SCALED_QUANT_KERNEL_V2_IMPL(quant_kernel, DTYPE_O, THREAD_DATA, BLOCK_SIZE) \
1519-
AITER_DISPATCH_FLOATING16_TYPES(input.scalar_type(), "quant_kernel", [&] { \
1520-
using input_dtype = typename t2ck<scalar_t>::type; \
1521-
int warps_per_cu = 8 * BLOCK_SIZE / WARP_SIZE; \
1522-
int num_tg = persistent_mode? num_cu * warps_per_cu : num_blocks; \
1523-
dim3 const grid(num_tg); \
1524-
aiter::quant_kernel<input_dtype, DTYPE_O, BLOCK_SIZE, THREAD_DATA> \
1525-
<<<grid, dim3(BLOCK_SIZE), 0, stream>>>( \
1526-
reinterpret_cast<DTYPE_O*>(out.data_ptr()), \
1527-
scales.data_ptr<float>(), \
1528-
reinterpret_cast<input_dtype*>(input.data_ptr()), \
1529-
smooth_scale.data_ptr<float>(), \
1530-
sorted_token_ids.data_ptr<int>(), \
1531-
sorted_expert_ids.data_ptr<int>(), \
1532-
num_valid_ids.data_ptr<int>(), \
1533-
num_experts, \
1534-
num_tokens, \
1535-
num_blocks, \
1536-
num_tg, \
1537-
cols, \
1538-
topk, \
1539-
block_m, \
1540-
block_m_log2split, \
1541-
input_stride0, \
1542-
input_stride1, \
1543-
shuffle_scale, \
1544-
transpose_out); \
1518+
#define MOE_SMOOTH_PER_TOKEN_SCALED_QUANT_KERNEL_V2_IMPL(quant_kernel, DTYPE_O, THREAD_DATA, BLOCK_SIZE) \
1519+
AITER_DISPATCH_FLOATING16_TYPES(input.scalar_type(), "quant_kernel", [&] { \
1520+
using input_dtype = typename t2ck<scalar_t>::type; \
1521+
int warps_per_cu = 8 * BLOCK_SIZE / WARP_SIZE; \
1522+
int num_tg = persistent_mode? num_cu * warps_per_cu : num_blocks; \
1523+
dim3 const grid(num_tg); \
1524+
aiter::quant_kernel<input_dtype, DTYPE_O, BLOCK_SIZE, THREAD_DATA> \
1525+
<<<grid, dim3(BLOCK_SIZE), 0, stream>>>( \
1526+
reinterpret_cast<DTYPE_O*>(out.data_ptr()), \
1527+
scales.data_ptr<float>(), \
1528+
reinterpret_cast<input_dtype*>(input.data_ptr()), \
1529+
smooth_scale.data_ptr<float>(), \
1530+
sorted_token_ids.data_ptr<int>(), \
1531+
sorted_expert_ids.data_ptr<int>(), \
1532+
num_valid_ids.data_ptr<int>(), \
1533+
num_experts, \
1534+
num_tokens, \
1535+
num_blocks, \
1536+
num_tg, \
1537+
cols, \
1538+
topk, \
1539+
block_m, \
1540+
block_m_log2split, \
1541+
input_stride0, \
1542+
input_stride1, \
1543+
shuffle_scale, \
1544+
transpose_out); \
15451545
});
15461546

15471547

@@ -1589,10 +1589,11 @@ void moe_smooth_per_token_scaled_quant_v2(
15891589
int input_stride1= input.dim() == 2 ? 0 : input.stride(1);
15901590

15911591
const int num_cu = get_num_cu_func();
1592-
int sub_block_m = 2;
1593-
int num_blocks = sorted_expert_ids.size(0) * (block_m / sub_block_m);
1594-
int block_split = block_m / sub_block_m;
1592+
int block_split = 16;
15951593
int block_m_log2split = log2(block_split);
1594+
TORCH_CHECK(block_m % block_split == 0, __func__, " block_m is not divisible by block_split");
1595+
int sub_block_m = block_m >> block_m_log2split;
1596+
int num_blocks = sorted_expert_ids.size(0) * block_split;
15961597
const bool persistent_mode = true;
15971598

15981599
const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(input));

0 commit comments

Comments
 (0)