PaddlePaddle · Jiang-Jia-Jun · Oct 9, 2025 · Aug 20, 2025 · Aug 21, 2025 · Aug 21, 2025
diff --git a/custom_ops/gpu_ops/append_attn/append_attention_c16_impl.cuh b/custom_ops/gpu_ops/append_attn/append_attention_c16_impl.cuh
@@ -1018,7 +1018,7 @@ void MultiQueryAppendAttention(
       } else {
         constexpr int blockx = HEAD_DIM / vec_size;
         constexpr int blocky = (128 + blockx - 1) / blockx;
-        dim3 grids_merge(min(sm_count * 4, token_num),
+        dim3 grids_merge(sm_count * 4,
                          num_heads);  // 128k is too large
         dim3 blocks_merge(blockx, blocky);
         merge_multi_chunks_v2_kernel<NV_TYPE,
@@ -1268,7 +1268,7 @@ void MultiQueryAppendAttention(
       } else {
         constexpr int blockx = HEAD_DIM / vec_size;
         constexpr int blocky = (128 + blockx - 1) / blockx;
-        dim3 grids_merge(min(sm_count * 4, token_num),
+        dim3 grids_merge(sm_count * 4,
                           num_heads);
         dim3 blocks_merge(blockx, blocky);
         merge_multi_chunks_v2_kernel<NV_TYPE,

diff --git a/custom_ops/gpu_ops/append_attn/append_attention_c4_impl.cuh b/custom_ops/gpu_ops/append_attn/append_attention_c4_impl.cuh
@@ -1229,7 +1229,7 @@ void MultiQueryAppendC4Attention(
       } else {
         constexpr int blockx = HEAD_DIM / vec_size;
         constexpr int blocky = (128 + blockx - 1) / blockx;
-        dim3 grids_merge(min(sm_count * 4, token_num),
+        dim3 grids_merge(sm_count * 4,
                          num_heads);
         dim3 blocks_merge(blockx, blocky);
         merge_multi_chunks_v2_kernel<NV_TYPE,
@@ -1506,7 +1506,7 @@ void MultiQueryAppendC4Attention(
       } else {
         constexpr int blockx = HEAD_DIM / vec_size;
         constexpr int blocky = (128 + blockx - 1) / blockx;
-        dim3 grids_merge(min(sm_count * 4, token_num),
+        dim3 grids_merge(sm_count * 4,
                           num_heads);
         dim3 blocks_merge(blockx, blocky);
         merge_multi_chunks_v2_kernel<NV_TYPE,

diff --git a/custom_ops/gpu_ops/append_attn/append_attention_c8_impl.cuh b/custom_ops/gpu_ops/append_attn/append_attention_c8_impl.cuh
@@ -1275,7 +1275,7 @@ void MultiQueryAppendC8Attention(
       } else {
         constexpr int blockx = HEAD_DIM / vec_size;
         constexpr int blocky = (128 + blockx - 1) / blockx;
-        dim3 grids_merge(min(sm_count * 4, token_num),
+        dim3 grids_merge(sm_count * 4,
                          num_heads);
         dim3 blocks_merge(blockx, blocky);
         merge_multi_chunks_v2_kernel<NV_TYPE,
@@ -1573,7 +1573,7 @@ void MultiQueryAppendC8Attention(
       } else {
         constexpr int blockx = HEAD_DIM / vec_size;
         constexpr int blocky = (128 + blockx - 1) / blockx;
-        dim3 grids_merge(min(sm_count * 4, token_num),
+        dim3 grids_merge(sm_count * 4,
                           num_heads);
         dim3 blocks_merge(blockx, blocky);
         merge_multi_chunks_v2_kernel<NV_TYPE,

diff --git a/custom_ops/gpu_ops/append_attn/append_attention_func.cuh b/custom_ops/gpu_ops/append_attn/append_attention_func.cuh
@@ -2418,6 +2418,9 @@ __global__ void merge_multi_chunks_v2_kernel(
   __shared__ float md_smem[bdy * 2];
   for (int qid = blockIdx.x; qid < token_num; qid += gridDim.x) {
     const uint32_t bid = batch_id_per_token[qid];
+    if(bid == -1){
+      continue;
+    }
     const uint32_t local_seq_id = qid - cu_seqlens_q[bid];
     const int seq_len_q = seq_lens_q[bid];
     if (seq_len_q == 0) continue;
@@ -2437,6 +2440,8 @@ __global__ void merge_multi_chunks_v2_kernel(
     const int num_chunks_this_seq = div_up(seq_len_kv, chunk_size);
     if (num_chunks_this_seq <= 1) {
       continue;
+    }else if (!ENABLE_PREFILL){
+      continue;
     }
 
     using LoadT = AlignedVector<T, vec_size>;

diff --git a/custom_ops/gpu_ops/append_attn/append_attention_kernel.h b/custom_ops/gpu_ops/append_attn/append_attention_kernel.h
@@ -232,113 +232,113 @@ void CascadeAppendAttentionKernel(
                                                 enable_prefill,
                                                 stream,
                                                 out);
-    } else if (cache_quant_type_str == "cache_int8") {
-        CascadeAppendAttentionC8Kernel<T, OutT>(meta_data,
-                                                qkv,
-                                                cache_k,
-                                                cache_v,
-                                                attn_mask,
-                                                cache_k_scale,
-                                                cache_v_scale,
-                                                cache_k_zp,
-                                                cache_v_zp,
-                                                shift_bias,
-                                                smooth_weight,
-                                                seq_lens_q,
-                                                seq_lens_kv,
-                                                seq_lens_encoder,
-                                                batch_id_per_token,
-                                                cu_seqlens_q,
-                                                block_table,
-                                                batch_ids,
-                                                tile_ids_per_batch,
-                                                num_blocks,
-                                                block_shape_q,
-                                                max_seq_len,
-                                                max_dec_len,
-                                                quant_max_bound,
-                                                quant_min_bound,
-                                                in_scale,
-                                                max_partition_size,
-                                                encoder_max_partition_size,
-                                                speculate_max_draft_token_num,
-                                                causal,
-                                                is_decoder,
-                                                enable_prefill,
-                                                cache_quant_type_str,
-                                                stream,
-                                                out);
-    } else if (cache_quant_type_str == "cache_fp8" or cache_quant_type_str == "block_wise_fp8") {
-        CascadeAppendAttentionC8Kernel<T, OutT, true>(meta_data,
-                                                qkv,
-                                                cache_k,
-                                                cache_v,
-                                                attn_mask,
-                                                cache_k_scale,
-                                                cache_v_scale,
-                                                cache_k_zp,
-                                                cache_v_zp,
-                                                shift_bias,
-                                                smooth_weight,
-                                                seq_lens_q,
-                                                seq_lens_kv,
-                                                seq_lens_encoder,
-                                                batch_id_per_token,
-                                                cu_seqlens_q,
-                                                block_table,
-                                                batch_ids,
-                                                tile_ids_per_batch,
-                                                num_blocks,
-                                                block_shape_q,
-                                                max_seq_len,
-                                                max_dec_len,
-                                                quant_max_bound,
-                                                quant_min_bound,
-                                                in_scale,
-                                                max_partition_size,
-                                                encoder_max_partition_size,
-                                                speculate_max_draft_token_num,
-                                                causal,
-                                                is_decoder,
-                                                enable_prefill,
-                                                cache_quant_type_str,
-                                                stream,
-                                                out);
-    } else if (cache_quant_type_str == "cache_int4_zp") {
-        CascadeAppendAttentionC4Kernel<T, OutT>(meta_data,
-                                                qkv,
-                                                cache_k,
-                                                cache_v,
-                                                attn_mask,
-                                                cache_k_scale,
-                                                cache_v_scale,
-                                                cache_k_zp,
-                                                cache_v_zp,
-                                                shift_bias,
-                                                smooth_weight,
-                                                seq_lens_q,
-                                                seq_lens_kv,
-                                                seq_lens_encoder,
-                                                batch_id_per_token,
-                                                cu_seqlens_q,
-                                                block_table,
-                                                batch_ids,
-                                                tile_ids_per_batch,
-                                                num_blocks,
-                                                block_shape_q,
-                                                max_seq_len,
-                                                max_dec_len,
-                                                quant_max_bound,
-                                                quant_min_bound,
-                                                in_scale,
-                                                max_partition_size,
-                                                encoder_max_partition_size,
-                                                speculate_max_draft_token_num,
-                                                causal,
-                                                is_decoder,
-                                                enable_prefill,
-                                                stream,
-                                                out);
+    // } else if (cache_quant_type_str == "cache_int8") {
+    //     CascadeAppendAttentionC8Kernel<T, OutT>(meta_data,
+    //                                             qkv,
+    //                                             cache_k,
+    //                                             cache_v,
+    //                                             attn_mask,
+    //                                             cache_k_scale,
+    //                                             cache_v_scale,
+    //                                             cache_k_zp,
+    //                                             cache_v_zp,
+    //                                             shift_bias,
+    //                                             smooth_weight,
+    //                                             seq_lens_q,
+    //                                             seq_lens_kv,
+    //                                             seq_lens_encoder,
+    //                                             batch_id_per_token,
+    //                                             cu_seqlens_q,
+    //                                             block_table,
+    //                                             batch_ids,
+    //                                             tile_ids_per_batch,
+    //                                             num_blocks,
+    //                                             block_shape_q,
+    //                                             max_seq_len,
+    //                                             max_dec_len,
+    //                                             quant_max_bound,
+    //                                             quant_min_bound,
+    //                                             in_scale,
+    //                                             max_partition_size,
+    //                                             encoder_max_partition_size,
+    //                                             speculate_max_draft_token_num,
+    //                                             causal,
+    //                                             is_decoder,
+    //                                             enable_prefill,
+    //                                             cache_quant_type_str,
+    //                                             stream,
+    //                                             out);
+    // } else if (cache_quant_type_str == "cache_fp8" or cache_quant_type_str == "block_wise_fp8") {
+    //     CascadeAppendAttentionC8Kernel<T, OutT, true>(meta_data,
+    //                                             qkv,
+    //                                             cache_k,
+    //                                             cache_v,
+    //                                             attn_mask,
+    //                                             cache_k_scale,
+    //                                             cache_v_scale,
+    //                                             cache_k_zp,
+    //                                             cache_v_zp,
+    //                                             shift_bias,
+    //                                             smooth_weight,
+    //                                             seq_lens_q,
+    //                                             seq_lens_kv,
+    //                                             seq_lens_encoder,
+    //                                             batch_id_per_token,
+    //                                             cu_seqlens_q,
+    //                                             block_table,
+    //                                             batch_ids,
+    //                                             tile_ids_per_batch,
+    //                                             num_blocks,
+    //                                             block_shape_q,
+    //                                             max_seq_len,
+    //                                             max_dec_len,
+    //                                             quant_max_bound,
+    //                                             quant_min_bound,
+    //                                             in_scale,
+    //                                             max_partition_size,
+    //                                             encoder_max_partition_size,
+    //                                             speculate_max_draft_token_num,
+    //                                             causal,
+    //                                             is_decoder,
+    //                                             enable_prefill,
+    //                                             cache_quant_type_str,
+    //                                             stream,
+    //                                             out);
+    // } else if (cache_quant_type_str == "cache_int4_zp") {
+    //     CascadeAppendAttentionC4Kernel<T, OutT>(meta_data,
+    //                                             qkv,
+    //                                             cache_k,
+    //                                             cache_v,
+    //                                             attn_mask,
+    //                                             cache_k_scale,
+    //                                             cache_v_scale,
+    //                                             cache_k_zp,
+    //                                             cache_v_zp,
+    //                                             shift_bias,
+    //                                             smooth_weight,
+    //                                             seq_lens_q,
+    //                                             seq_lens_kv,
+    //                                             seq_lens_encoder,
+    //                                             batch_id_per_token,
+    //                                             cu_seqlens_q,
+    //                                             block_table,
+    //                                             batch_ids,
+    //                                             tile_ids_per_batch,
+    //                                             num_blocks,
+    //                                             block_shape_q,
+    //                                             max_seq_len,
+    //                                             max_dec_len,
+    //                                             quant_max_bound,
+    //                                             quant_min_bound,
+    //                                             in_scale,
+    //                                             max_partition_size,
+    //                                             encoder_max_partition_size,
+    //                                             speculate_max_draft_token_num,
+    //                                             causal,
+    //                                             is_decoder,
+    //                                             enable_prefill,
+    //                                             stream,
+    //                                             out);
     } else {
         PD_THROW(
             "cache_quant_type_str should be one of [none, cache_int8, "