
Commit 2c56b6d

fix: postprocess for speculative decode

1 parent a65b22d

File tree: 1 file changed (+36, -26 lines)

fastdeploy/output/token_processor.py

Lines changed: 36 additions & 26 deletions
```diff
@@ -75,7 +75,7 @@ def __init__(self, cfg, cached_generated_tokens, engine_worker_queue, split_conn
                 self.output_scores = paddle.full(
                     shape=[MAX_BSZ * MAX_DRAFT_TOKENS * (K + 1), 1], fill_value=0.0, dtype="float32"
                 )
-                self.output_ranks = paddle.full(shape=[MAX_BSZ * MAX_DRAFT_TOKENS], fill_value=0, dtype="int64")
+                self.output_ranks = paddle.full(shape=[MAX_BSZ * MAX_DRAFT_TOKENS], fill_value=0, dtype="int64")
             else:
                 self.output_tokens = paddle.full(
                     shape=[SPECULATE_MAX_BSZ * MAX_DRAFT_TOKENS + SPECULATE_MAX_BSZ + 2],
@@ -85,7 +85,7 @@ def __init__(self, cfg, cached_generated_tokens, engine_worker_queue, split_conn
         elif self.use_logprobs:
             self.output_tokens = paddle.full(shape=[MAX_BSZ * (K + 1) + 2, 1], fill_value=2, dtype="int64")
             self.output_scores = paddle.full(shape=[MAX_BSZ * (K + 1), 1], fill_value=0.0, dtype="float32")
-            self.output_ranks = paddle.full(shape=[MAX_BSZ], fill_value=0, dtype="int64")
+            self.output_ranks = paddle.full(shape=[MAX_BSZ], fill_value=0, dtype="int64")
         else:
             self.output_tokens = paddle.full(shape=[MAX_BSZ + 2, 1], fill_value=2, dtype="int64")
         self.worker = None
```
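For orientation, the buffers touched here are flat paddle tensors pre-sized for the worst case. Below is a minimal sketch of the sizing arithmetic with illustrative stand-in values; the real MAX_BSZ, K, MAX_DRAFT_TOKENS, and SPECULATE_MAX_BSZ constants are defined elsewhere in FastDeploy and are not part of this diff, and the reading of the "+ 2" slots is an assumption.

```python
# Illustrative values only; the real constants live outside this diff.
MAX_BSZ = 256            # assumed max batch size
K = 20                   # assumed max top-k logprob entries per token
MAX_DRAFT_TOKENS = 6     # assumed max speculative tokens per step
SPECULATE_MAX_BSZ = 256  # assumed max batch size in speculative mode

# Speculative + logprobs: each request may emit up to MAX_DRAFT_TOKENS
# tokens per step, and each token carries the sampled entry plus K top-k
# entries, hence the (K + 1) factor in output_scores.
spec_scores_rows = MAX_BSZ * MAX_DRAFT_TOKENS * (K + 1)
# One sampled-token rank per (request, draft position).
spec_ranks_rows = MAX_BSZ * MAX_DRAFT_TOKENS

# Plain logprobs mode: one token per request per step.
scores_rows = MAX_BSZ * (K + 1)
ranks_rows = MAX_BSZ

# Speculative, no logprobs: one slot per draft position, plus per-request
# slots and what appears to be a small header (the "+ 2"; an assumption).
spec_tokens_rows = SPECULATE_MAX_BSZ * MAX_DRAFT_TOKENS + SPECULATE_MAX_BSZ + 2
```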
```diff
@@ -323,29 +323,35 @@ def process_sampling_results(self):
                 get_output_ep,
                 get_output_topk,
                 speculate_get_output,
+                speculate_get_output_topk,
             )
         rank_id = self.cfg.parallel_config.local_data_parallel_id

         while True:
             try:
                 is_blocking = True
                 if self.speculative_decoding:
-                    if (
-                        self.cfg.parallel_config.enable_expert_parallel
-                        and self.cfg.parallel_config.data_parallel_size > 1
-                    ):
-                        if self.use_logprobs:
-                            # TODO speculate_get_output_with_topk
-                            pass
-                        else:
-                            speculate_get_output(self.output_tokens, rank_id, is_blocking, True)
-                    elif self.use_logprobs:
-                        # TODO speculate_get_output_with_topk
-                        pass
+                    if self.use_logprobs:
+                        speculate_get_output_topk(
+                            self.output_tokens,
+                            self.output_scores,
+                            self.output_ranks,
+                            K,
+                            rank_id,
+                            is_blocking,
+                        )
+                        if self.output_tokens[0, 0] == -2:
+                            continue
                     else:
-                        speculate_get_output(self.output_tokens, rank_id, is_blocking, False)
-                        if self.output_tokens[0] == -2:
-                            continue
+                        if (
+                            self.cfg.parallel_config.enable_expert_parallel
+                            and self.cfg.parallel_config.data_parallel_size > 1
+                        ):
+                            speculate_get_output(self.output_tokens, rank_id, is_blocking, True)
+                        else:
+                            speculate_get_output(self.output_tokens, rank_id, is_blocking, False)
+                        if self.output_tokens[0] == -2:
+                            continue
                 else:
                     if self.use_logprobs:
                         get_output_topk(
```
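This hunk inverts the branch order: the logprobs check now comes first, so the expert-parallel case no longer dead-ends in a TODO, and expert parallelism only selects the final flag of `speculate_get_output`. A condensed, self-contained sketch of the resulting control flow; plain functions and numpy arrays stand in for the method, its config, and the real GPU ops:

```python
import numpy as np

K = 20  # assumed top-k width; illustrative, not FastDeploy's real constant

# Stubs for the GPU ops imported in the hunk above; they only write the
# -2 "nothing ready yet" sentinel so the sketch runs without paddle.
def speculate_get_output_topk(tokens, scores, ranks, k, rank_id, blocking):
    tokens[0, 0] = -2

def speculate_get_output(tokens, rank_id, blocking, use_expert_parallel):
    tokens[0] = -2

def poll_once(speculative_decoding, use_logprobs, enable_ep, dp_size,
              output_tokens, output_scores=None, output_ranks=None, rank_id=0):
    """Condensed control flow of the loop body after this change."""
    if speculative_decoding:
        if use_logprobs:
            # New path: one fused op fills tokens, scores, and ranks;
            # the old code dead-ended in a TODO here.
            speculate_get_output_topk(output_tokens, output_scores,
                                      output_ranks, K, rank_id, True)
            return output_tokens[0, 0] != -2  # 2-D buffer in logprobs mode
        # Non-logprobs path: expert parallelism now only picks the flag.
        use_ep = enable_ep and dp_size > 1
        speculate_get_output(output_tokens, rank_id, True, use_ep)
        return output_tokens[0] != -2  # 1-D buffer otherwise
    return True  # non-speculative branches unchanged, elided here

tokens_2d = np.zeros((4, 1), dtype=np.int64)
assert not poll_once(True, True, False, 1, tokens_2d)  # sentinel: skip batch
```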
```diff
@@ -400,18 +406,21 @@ def postprocess(self, batch_result: List[RequestOutput], mtype=3):
         try:
             if self.cfg.speculative_config.method and self.use_logprobs:
                 if mtype == 3:  # target
-                    has_finished = any(r.finished for r in batch_result)
-                    if has_finished:
+                    finished_batch_result, unfinished_batch_result = [], []
+                    for r in batch_result:
+                        (finished_batch_result if r.finished else unfinished_batch_result).append(r)
+                    if finished_batch_result:
                         self.cached_generated_tokens.put_results(batch_result)
                     else:
-                        self._batch_result_buffer = batch_result
+                        self._batch_result_buffer = unfinished_batch_result
                 elif mtype == 4:  # draft
                     target_batch_result = []
                     draft_batch_result = batch_result
-                    for target, decode in zip(self._batch_result_buffer, draft_batch_result):
-                        target.outputs.draft_top_logprobs = decode.outputs.draft_top_logprobs
-                        target_batch_result.append(target)
-                    self._batch_result_buffer = None
+                    if self._batch_result_buffer is not None:
+                        for target, decode in zip(self._batch_result_buffer, draft_batch_result):
+                            target.outputs.draft_top_logprobs = decode.outputs.draft_top_logprobs
+                            target_batch_result.append(target)
+                        self._batch_result_buffer = None
                     self.cached_generated_tokens.put_results(target_batch_result)
                 else:
                     self.cached_generated_tokens.put_results(batch_result)
```
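The change above pairs each buffered target batch (mtype 3) with the following draft batch (mtype 4), only buffers the unfinished targets, and guards the merge so a missing buffer no longer raises on `zip(None, ...)`. A self-contained sketch of the handshake; the simplified result classes below are stand-ins for RequestOutput, not the real type:

```python
class FakeOutputs:
    """Stand-in for RequestOutput.outputs; only the field this hunk touches."""
    def __init__(self):
        self.draft_top_logprobs = None

class FakeResult:
    """Stand-in for RequestOutput."""
    def __init__(self, finished=False):
        self.finished = finished
        self.outputs = FakeOutputs()

emitted = []   # plays the role of cached_generated_tokens
buffer = None  # plays the role of self._batch_result_buffer

def postprocess(batch_result, mtype):
    global buffer
    if mtype == 3:  # target step
        finished = [r for r in batch_result if r.finished]
        unfinished = [r for r in batch_result if not r.finished]
        if finished:
            emitted.append(batch_result)  # any finished request flushes all
        else:
            buffer = unfinished           # hold targets for the draft step
    elif mtype == 4:  # draft step
        target_batch = []
        if buffer is not None:            # the new guard: buffer may be gone
            for target, draft in zip(buffer, batch_result):
                target.outputs.draft_top_logprobs = draft.outputs.draft_top_logprobs
                target_batch.append(target)
            buffer = None
        emitted.append(target_batch)

# A target batch with no finished request is buffered; the draft batch then
# donates its draft_top_logprobs and the merged results are emitted.
postprocess([FakeResult(), FakeResult()], mtype=3)
postprocess([FakeResult(), FakeResult()], mtype=4)
assert buffer is None and len(emitted) == 1
```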
```diff
@@ -671,12 +680,13 @@ def _process_batch_output(self):
                     result.outputs.draft_top_logprobs.logprob_token_ids.extend([topk_token_ids])
                     result.outputs.draft_top_logprobs.logprobs.extend([topk_logprobs])
                     result.outputs.draft_top_logprobs.sampled_token_ranks.extend([sampled_rank])
-                if token_id in task.eos_token_ids or is_prefill or recovery_stop:
+                if mtype == 3 and (token_id in task.eos_token_ids or is_prefill or recovery_stop):
                     result.finished = True
                     if recovery_stop:
                         result.error_msg = "Recover is not supported, the result is incomplete!"
                     llm_logger.info(
-                        f"Request: {task_id} finished, number of " f"generated tokens: {self.tokens_counter[task_id]}."
+                        f"Request: {task_id} finished, number of "
+                        f"generated tokens: {self.tokens_counter[task_id]}, token_id:{token_id},is_prefill:{is_prefill},recovery_stop:{recovery_stop}"
                     )
                     llm_logger.info(
                         f"Request: {task_id} token ratio: {self.tokens_counter[task_id] / (time.time() - task.inference_start_time)}"
```
