PaddlePaddle · Jiang-Jia-Jun · Oct 9, 2025 · Aug 20, 2025 · Aug 21, 2025 · Aug 21, 2025
diff --git a/custom_ops/gpu_ops/append_attn/append_attention_func.cuh b/custom_ops/gpu_ops/append_attn/append_attention_func.cuh
@@ -2418,6 +2418,9 @@ __global__ void merge_multi_chunks_v2_kernel(
   __shared__ float md_smem[bdy * 2];
   for (int qid = blockIdx.x; qid < token_num; qid += gridDim.x) {
     const uint32_t bid = batch_id_per_token[qid];
+    if(bid == -1){
+      continue;
+    }
     const uint32_t local_seq_id = qid - cu_seqlens_q[bid];
     const int seq_len_q = seq_lens_q[bid];
     if (seq_len_q == 0) continue;
@@ -2437,6 +2440,8 @@ __global__ void merge_multi_chunks_v2_kernel(
     const int num_chunks_this_seq = div_up(seq_len_kv, chunk_size);
     if (num_chunks_this_seq <= 1) {
       continue;
+    }else if (!ENABLE_PREFILL){
+      continue;
     }
 
     using LoadT = AlignedVector<T, vec_size>;

diff --git a/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_impl.cuh b/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_impl.cuh
@@ -84,15 +84,7 @@ __global__ void append_speculate_cache_T_rope_qk_norm_kernel(
     const int* block_table_now = block_tables + ori_bi * max_blocks_per_seq;
     const int block_idx = block_table_now[write_seq_id / block_size];
     if (block_idx < 0) {
-      printf(
-          "Fatal Error!!!, block idx %d when write_seq_id is %d\n some key var "
-          "%d %d %d %d\n",
-          block_idx,
-          write_seq_id,
-          ori_bi,
-          seq_lens_decoder[ori_bi],
-          token_id,
-          cu_seqlens_q[ori_bi]);
+      return ;  // NOTE(gongshaotian): For CUDAGraph padding
     }
     const int block_offset = write_seq_id % block_size;
 
@@ -390,15 +382,7 @@ __global__ void append_speculate_cache_rope_kernel(
     const int* block_table_now = block_tables + ori_bi * max_blocks_per_seq;
     const int block_idx = block_table_now[write_seq_id / block_size];
     if (block_idx < 0) {
-      printf(
-          "Fatal Error!!!, block idx %d when write_seq_id is %d\n some key var "
-          "%d %d %d %d\n",
-          block_idx,
-          write_seq_id,
-          ori_bi,
-          seq_lens_decoder[ori_bi],
-          token_id,
-          cu_seqlens_q[ori_bi]);
+      return ;  // NOTE(gongshaotian): For CUDAGraph padding
     }
     const int block_offset = write_seq_id % block_size;
 
@@ -525,15 +509,7 @@ __global__ void append_speculate_cache_neox_rope_kernel(
     const int* block_table_now = block_tables + ori_bi * max_blocks_per_seq;
     const int block_idx = block_table_now[write_seq_id / block_size];
     if (block_idx < 0) {
-      printf(
-          "Fatal Error!!!, block idx %d when write_seq_id is %d\n some key var "
-          "%d %d %d %d\n",
-          block_idx,
-          write_seq_id,
-          ori_bi,
-          seq_lens_decoder[ori_bi],
-          token_id,
-          cu_seqlens_q[ori_bi]);
+      return ;  // NOTE(gongshaotian): For CUDAGraph padding
     }
     const int block_offset = write_seq_id % block_size;
 

diff --git a/custom_ops/gpu_ops/cpp_extensions.cc b/custom_ops/gpu_ops/cpp_extensions.cc
@@ -682,7 +682,7 @@ void SpeculateVerify(
     const paddle::Tensor &output_cum_offsets,
     const paddle::Tensor &actual_candidate_len,
     const paddle::Tensor &actual_draft_token_nums, const paddle::Tensor &topp,
-    int max_seq_len, int verify_window, bool enable_topp, bool benchmark_mode);
+    int max_seq_len, int verify_window, bool enable_topp, bool benchmark_mode, bool accept_all_drafts);
 
 void SpeculateUpdate(const paddle::Tensor &seq_lens_encoder,
                        const paddle::Tensor &seq_lens_decoder,

diff --git a/custom_ops/gpu_ops/rebuild_padding.cu b/custom_ops/gpu_ops/rebuild_padding.cu
@@ -130,7 +130,6 @@ std::vector<paddle::Tensor> rebuild_padding(
     int pack_num = elem_nums / PackSize;
     const int blocksize = 128;
     const int grid_size = (pack_num + blocksize - 1) / blocksize;
-
     if (output_padding_offset) {
         RebuildAppendPaddingKernel<DataType_, PackSize>
             <<<grid_size, blocksize, 0, cu_stream>>>(

diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_get_padding_offset.cu b/custom_ops/gpu_ops/speculate_decoding/speculate_get_padding_offset.cu
@@ -139,7 +139,6 @@ std::vector<paddle::DataType> SpeculateGetPaddingOffsetInferDtype(
 PD_BUILD_STATIC_OP(speculate_get_padding_offset)
     .Inputs({"input_ids",
              "draft_tokens",
-             "cum_offsets"
              "token_num",
              "seq_len",
              "seq_lens_encoder"})

diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_verify.cu b/custom_ops/gpu_ops/speculate_decoding/speculate_verify.cu
@@ -73,7 +73,7 @@ __global__ void speculate_verify(
     const int *output_cum_offsets, const int *actual_candidate_len,
     const int real_bsz, const int max_draft_tokens, const int end_length,
     const int max_seq_len, const int max_candidate_len, const int verify_window,
-    const bool prefill_one_step_stop, const bool benchmark_mode) {
+    const bool prefill_one_step_stop, const bool benchmark_mode, const bool accept_all_drafts) {
   const int bid = threadIdx.x;
   // verify and set stop flags
   int accept_num_now = 1;
@@ -101,6 +101,24 @@ __global__ void speculate_verify(
         if (seq_lens_encoder[bid] != 0) {
           break;
         }
+        if (accept_all_drafts) {
+          // accept all draft tokens
+          step_idx[bid]++;
+          auto accept_token = draft_tokens_now[i + 1];
+          accept_tokens[bid * max_draft_tokens + i] = accept_token;
+
+          if (is_in_end(accept_token, end_tokens, end_length) ||
+              step_idx[bid] >= max_dec_len[bid]) {
+            stop_flags[bid] = true;
+            stop_flag_now_int = 1;
+            if (step_idx[bid] >= max_dec_len[bid])
+              accept_tokens[bid * max_draft_tokens + i] = end_tokens[0];
+            break;
+          } else {
+            accept_num_now++;
+          }
+          continue;
+        }
         if (USE_TOPK) {
           if (verify_tokens_now[i * max_candidate_len] ==
               draft_tokens_now[i + 1]) {
@@ -249,7 +267,7 @@ void SpeculateVerify(
     const paddle::Tensor &output_cum_offsets,
     const paddle::Tensor &actual_candidate_len,
     const paddle::Tensor &actual_draft_token_nums, const paddle::Tensor &topp,
-    int max_seq_len, int verify_window, bool enable_topp, bool benchmark_mode) {
+    int max_seq_len, int verify_window, bool enable_topp, bool benchmark_mode, bool accept_all_drafts) {
   //   printf("Enter speculate update\n");
   auto bsz = accept_tokens.shape()[0];
   int real_bsz = seq_lens_this_time.shape()[0];
@@ -292,7 +310,7 @@ void SpeculateVerify(
           is_block_step.data<bool>(), output_cum_offsets.data<int>(),
           actual_candidate_len.data<int>(), real_bsz, max_draft_tokens,
           end_length, max_seq_len, max_candidate_len, verify_window,
-          prefill_one_step_stop, benchmark_mode);
+          prefill_one_step_stop, benchmark_mode, accept_all_drafts);
     } else {
       speculate_verify<false, true>
           <<<1, BlockSize, 0, accept_tokens.stream()>>>(
@@ -308,7 +326,7 @@ void SpeculateVerify(
               end_tokens.data<int64_t>(), is_block_step.data<bool>(),
               output_cum_offsets.data<int>(), actual_candidate_len.data<int>(),
               real_bsz, max_draft_tokens, end_length, max_seq_len,
-              max_candidate_len, verify_window, prefill_one_step_stop, benchmark_mode);
+              max_candidate_len, verify_window, prefill_one_step_stop, benchmark_mode, accept_all_drafts);
     }
   } else {
     if (enable_topp) {
@@ -326,7 +344,7 @@ void SpeculateVerify(
               end_tokens.data<int64_t>(), is_block_step.data<bool>(),
               output_cum_offsets.data<int>(), actual_candidate_len.data<int>(),
               real_bsz, max_draft_tokens, end_length, max_seq_len,
-              max_candidate_len, verify_window, prefill_one_step_stop, benchmark_mode);
+              max_candidate_len, verify_window, prefill_one_step_stop, benchmark_mode, accept_all_drafts);
     } else {
       speculate_verify<false, false>
           <<<1, BlockSize, 0, accept_tokens.stream()>>>(
@@ -342,7 +360,7 @@ void SpeculateVerify(
               end_tokens.data<int64_t>(), is_block_step.data<bool>(),
               output_cum_offsets.data<int>(), actual_candidate_len.data<int>(),
               real_bsz, max_draft_tokens, end_length, max_seq_len,
-              max_candidate_len, verify_window, prefill_one_step_stop, benchmark_mode);
+              max_candidate_len, verify_window, prefill_one_step_stop, benchmark_mode, accept_all_drafts);
     }
   }
 
@@ -357,7 +375,7 @@ PD_BUILD_STATIC_OP(speculate_verify)
              "actual_candidate_len", "actual_draft_token_nums", "topp"})
     .Outputs({"accept_tokens_out", "accept_num_out", "step_idx_out",
               "stop_flags_out"})
-    .Attrs({"max_seq_len: int", "verify_window: int", "enable_topp: bool", "benchmark_mode: bool"})
+    .Attrs({"max_seq_len: int", "verify_window: int", "enable_topp: bool", "benchmark_mode: bool","accept_all_drafts: bool"})
     .SetInplaceMap({{"accept_tokens", "accept_tokens_out"},
                     {"accept_num", "accept_num_out"},
                     {"step_idx", "step_idx_out"},

diff --git a/fastdeploy/config.py b/fastdeploy/config.py
@@ -1434,6 +1434,11 @@ def __init__(
 
         if self.graph_opt_config.cudagraph_only_prefill:
             self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=512)
+        elif self.speculative_config.method == "mtp":
+            max_shape = self.scheduler_config.max_num_seqs * (self.speculative_config.num_model_steps + 1)
+            if max_shape % 2 == 1:  # round an odd token count up to the next even size
+                max_shape = max_shape + 1
+            self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=min(512, max_shape))
         else:
             self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=self.scheduler_config.max_num_seqs)
 

diff --git a/fastdeploy/model_executor/forward_meta.py b/fastdeploy/model_executor/forward_meta.py
@@ -167,7 +167,7 @@ def format_str(obj):
                     "shape": obj.shape,
                     "dtype": str(obj.dtype),
                     "place": str(obj.place),
-                    # "content": obj if obj.numel()<10 else "Too big to show"
+                    "content": obj if obj.numel() < 70 else "Too big to show",
                 }
                 return tensor_info
             elif isinstance(obj, (list, tuple)):

diff --git a/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py b/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py
@@ -111,7 +111,7 @@ def run_static_model(self, entry: ConcreteSizeEntry, **kwargs):
                 entry.num_finished_warmup += 1
                 entry.runnable(**kwargs)
                 logger.debug(
-                    f"[CUDA GRAPH] Warm up for batch size {entry.real_shape}, "
+                    f"[CUDA GRAPH][ID:{id(self)}] Warm up for batch size {entry.real_shape}, "
                     f"finished ({n + 1}/{entry.num_finished_warmup}) times"
                 )
 
@@ -138,15 +138,15 @@ def __call__(self, **kwargs) -> List[paddle.Tensor] | paddle.Tensor:
         real_shape = ids_remove_padding.shape[0]
         padding_real_shape = self.real_shape_to_captured_size[real_shape]
         logger.debug(
-            f"[CUDA GRAPH] The actual real shape obtained by CUDAGraph is :{real_shape}, "
-            f"The padded shape is :{padding_real_shape}"
+            f"[CUDA GRAPH][ID:{id(self)}] The actual real shape obtained by CUDAGraph is :{real_shape}, "
+            f"The padded shape is :{padding_real_shape}, If Padding :{real_shape != padding_real_shape}"
         )
 
         entry = self.concrete_size_entries.get(padding_real_shape)
         assert entry is not None, f"real shape:{padding_real_shape} is not in cuda graph capture list."
         if entry.runnable is None:
             entry.runnable = self.runnable
-            logger.debug(f"[CUDA GRAPH] New entry lazy initialize with real shape {padding_real_shape}")
+            logger.debug(f"[CUDA GRAPH][ID:{id(self)}] New entry lazy initialize with real shape {padding_real_shape}")
 
         if not entry.use_cudagraph:
             return entry.runnable(**kwargs)
@@ -161,7 +161,7 @@ def __call__(self, **kwargs) -> List[paddle.Tensor] | paddle.Tensor:
                 entry.num_finished_warmup += 1
                 entry.runnable(**kwargs)
                 logger.debug(
-                    f"[CUDA GRAPH] Warm up for real shape {padding_real_shape}, "
+                    f"[CUDA GRAPH][ID:{id(self)}] Warm up for real shape {padding_real_shape}, "
                     f"finished ({n + 1}/{entry.num_finished_warmup}) times"
                 )
 
@@ -196,11 +196,11 @@ def __call__(self, **kwargs) -> List[paddle.Tensor] | paddle.Tensor:
 
             # For CUDAGraph debug
             # self._save_cudagrpah_dot_files(entry)
-            logger.debug(f"[CUDA GRAPH] CUDAGraph captured for real shape {padding_real_shape}")
+            logger.debug(f"[CUDA GRAPH][ID:{id(self)}] CUDAGraph captured for real shape {padding_real_shape}")
 
         # Replay
         entry.cuda_graph.replay()
-        logger.debug(f"[CUDA GRAPH] CUDAGraph replayed for real shape {padding_real_shape}")
+        logger.debug(f"[CUDA GRAPH][ID:{id(self)}] CUDAGraph replayed for real shape {padding_real_shape}")
         if len(entry.output_buffers) == 1:
             return entry.output_buffers[0]
         return entry.output_buffers
@@ -213,8 +213,9 @@ def _create_entry_dict(self):
         for shape in self.cudagraph_capture_sizes:
             self.concrete_size_entries[shape] = ConcreteSizeEntry(real_shape=shape)
 
-        logger.info(
-            f"[CUDA GRAPH] CUDAGraph capture list {self.cudagraph_capture_sizes}, " "Created all real shape entry."
+        logger.debug(
+            f"[CUDA GRAPH][ID:{id(self)}] CUDAGraph capture list {self.cudagraph_capture_sizes}, "
+            "Created all real shape entry."
         )
 
     def clear_graph(self):
@@ -223,7 +224,7 @@ def clear_graph(self):
         for id, entry in self.concrete_size_entries.items():
             if entry.cuda_graph:
                 del entry.cuda_graph
-                logger.debug(f"[CUDA GRAPH] The CUDAGraph with shape {id} has been cleared.")
+                logger.debug(f"[CUDA GRAPH][ID:{id(self)}] The CUDAGraph with shape {id} has been cleared.")
 
         del self.concrete_size_entries
         paddle.device.cuda.empty_cache()

diff --git a/fastdeploy/model_executor/graph_optimization/graph_optimization_backend.py b/fastdeploy/model_executor/graph_optimization/graph_optimization_backend.py
@@ -115,7 +115,7 @@ def __init__(self, runnable: Callable, fd_config: FDConfig):
         self.runnable = runnable
         self.fd_config = fd_config
 
-        self.max_captre_batch = fd_config.graph_opt_config.cudagraph_capture_sizes[0]
+        self.max_captre_size = fd_config.graph_opt_config.cudagraph_capture_sizes[0]
         if self.fd_config.graph_opt_config.graph_opt_level > 0:
             # 1. Prepare cuda graph input buffers (contain output of subgraphs)
 
@@ -138,9 +138,9 @@ def __call__(self, **kwargs):
             )
 
         assert kwargs["forward_meta"].ids_remove_padding is not None
-        batch_size = kwargs["forward_meta"].ids_remove_padding.shape[0]
+        real_shape = kwargs["forward_meta"].ids_remove_padding.shape[0]
 
-        if (not kwargs["forward_meta"].step_use_cudagraph) or (batch_size > self.max_captre_batch):
+        if (not kwargs["forward_meta"].step_use_cudagraph) or (real_shape > self.max_captre_size):
             return self.runnable(**kwargs)
         else:
             return self.cudagraph_piecewise_backend.__call__(**kwargs)

diff --git a/fastdeploy/model_executor/layers/sample/sampler.py b/fastdeploy/model_executor/layers/sample/sampler.py
@@ -461,6 +461,7 @@ def forward_cuda(
         sampling_metadata: SamplingMetadata,
         max_model_len: int,
         share_inputs: List[paddle.Tensor],
+        accept_all_drafts: bool = False,
     ) -> paddle.Tensor:
         """ """
 
@@ -517,6 +518,7 @@ def forward_cuda(
             self.speculative_verify_window,
             True,  # enable_topp
             self.speculative_benchmark_mode,
+            accept_all_drafts,
         )
 
         return None

diff --git a/fastdeploy/model_executor/models/ernie4_5_mtp.py b/fastdeploy/model_executor/models/ernie4_5_mtp.py
@@ -28,6 +28,9 @@
 
 from fastdeploy.config import FDConfig
 from fastdeploy.model_executor.forward_meta import ForwardMeta
+from fastdeploy.model_executor.graph_optimization.decorator import (
+    support_graph_optimization,
+)
 from fastdeploy.model_executor.layers.mtp_linear import ParallelEHProjection
 from fastdeploy.model_executor.layers.normalization import RMSNorm
 from fastdeploy.model_executor.models.ernie4_5_moe import Ernie4_5_DecoderLayer
@@ -234,6 +237,7 @@ def get_tensor_parallel_split_mappings(num_layers, moe_num_experts, moe_layer_st
         return mappings
 
 
+@support_graph_optimization
 class Ernie4_5_MTPModel(nn.Layer):
     """
     Ernie4_5_MTPModel
@@ -457,6 +461,10 @@ def forward(
         """
         forward
         """
-        hidden_states = self.ernie(ids_remove_padding, previous_hidden_states, forward_meta)
+        hidden_states = self.ernie(
+            ids_remove_padding=ids_remove_padding,
+            previous_hidden_states=previous_hidden_states,
+            forward_meta=forward_meta,
+        )
 
         return hidden_states
diff --git a/fastdeploy/spec_decode/base.py b/fastdeploy/spec_decode/base.py
@@ -33,24 +33,25 @@ class Proposer(ABC):
     the speculative decoding framework
     """
 
-    def __init__(self, cfg: FDConfig):
+    def __init__(self, fd_config: FDConfig):
         """
         Init Speculative proposer
         """
-        cfg.parallel_config.tp_group = None
-        self.cfg = deepcopy(cfg)
-        cfg.parallel_config.tp_group = dist.get_group(
-            cfg.parallel_config.data_parallel_rank + envs.FD_TP_GROUP_GID_OFFSET
+        fd_config.parallel_config.tp_group = None
+        self.fd_config = deepcopy(fd_config)
+        fd_config.parallel_config.tp_group = dist.get_group(
+            fd_config.parallel_config.data_parallel_rank + envs.FD_TP_GROUP_GID_OFFSET
         )
-        self.cfg.parallel_config.tp_group = dist.get_group(
-            cfg.parallel_config.data_parallel_rank + envs.FD_TP_GROUP_GID_OFFSET
+        self.fd_config.parallel_config.tp_group = dist.get_group(
+            fd_config.parallel_config.data_parallel_rank + envs.FD_TP_GROUP_GID_OFFSET
         )
-        self.parallel_config = self.cfg.parallel_config
-        self.model_config = self.cfg.model_config
-        self.speculative_config = self.cfg.speculative_config
-        self.cache_config = self.cfg.cache_config
-        self.quant_config = self.cfg.quant_config
-        self.scheduler_config = self.cfg.scheduler_config
+        self.parallel_config = self.fd_config.parallel_config
+        self.model_config = self.fd_config.model_config
+        self.speculative_config = self.fd_config.speculative_config
+        self.cache_config = self.fd_config.cache_config
+        self.quant_config = self.fd_config.quant_config
+        self.graph_opt_config = self.fd_config.graph_opt_config
+        self.scheduler_config = self.fd_config.scheduler_config
 
         self.max_num_seqs = self.scheduler_config.max_num_seqs
         self.max_model_len = self.parallel_config.max_model_len