Skip to content
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
4bc170c
[Feature] support prefix cache in DP
Oct 10, 2025
1851e80
fix
Oct 10, 2025
983a8a8
Merge branch 'develop' into update_ep
ltd0924 Oct 10, 2025
2a9e046
Update common_engine.py
ltd0924 Oct 10, 2025
58fced4
Merge branch 'develop' into update_ep
ltd0924 Oct 10, 2025
f5733dd
Update common_engine.py
ltd0924 Oct 10, 2025
427cf47
Update common_engine.py
ltd0924 Oct 10, 2025
00bdcc2
Update common_engine.py
ltd0924 Oct 10, 2025
0822d63
[BugFix] fix workers more than 1
Oct 11, 2025
85b6990
Merge branch 'develop' into update_ep
ltd0924 Oct 11, 2025
0acf059
fix
Oct 11, 2025
667d146
Update api_server.py
ltd0924 Oct 12, 2025
a03dfe6
fix
Oct 12, 2025
141abd7
Update api_server.py
ltd0924 Oct 13, 2025
1f07ecd
fix
Oct 13, 2025
a531165
Merge branch 'develop' into update_ep
ltd0924 Oct 13, 2025
3670530
Merge branch 'develop' into update_ep
ltd0924 Oct 14, 2025
ab6f741
Merge branch 'develop' into update_ep
ltd0924 Oct 15, 2025
90cd313
[Feature] Support disabling prefix cache for multimodal models
Oct 16, 2025
e60d098
Merge branch 'develop' into update_ep
ltd0924 Oct 16, 2025
0053f17
Update api_server.py
ltd0924 Oct 16, 2025
03d9f22
Update engine_client.py
ltd0924 Oct 20, 2025
9ade15e
Update engine_client.py
ltd0924 Oct 20, 2025
7a93f0c
add test
Oct 20, 2025
a38a272
Merge branch 'develop' into update_ep
ltd0924 Oct 20, 2025
842cde7
Update test_chat.py
ltd0924 Oct 20, 2025
bd4ec3c
fix
Oct 20, 2025
b3112ba
Merge branch 'develop' into update_ep
ltd0924 Oct 20, 2025
33d9093
fix
Oct 20, 2025
334d29a
Update test_chat.py
ltd0924 Oct 20, 2025
6957bdc
Update test_chat.py
ltd0924 Oct 20, 2025
2cdcc02
Merge branch 'develop' into update_ep
Jiang-Jia-Jun Oct 21, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions fastdeploy/entrypoints/engine_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ def __init__(
architectures = ModelConfig({"model": model_name_or_path}).architectures[0]
if MultimodalRegistry.contains_model(architectures):
self.enable_mm = True
self.disable_prefix_mm = MultimodalRegistry.contains_mm_disable_prefix_cache_model(architectures)
else:
self.enable_mm = False

Expand Down Expand Up @@ -158,6 +159,16 @@ async def format_and_add_data(self, prompts: dict):
await self.add_requests(prompts)
return prompts["prompt_token_ids"]

def _check_mm_disable_prefix_cache(self, task):
    """Return True when *task* contains multimodal tokens but the model
    disallows prefix caching for multimodal data.

    Looks at ``task["multimodal_inputs"]["token_type_ids"]``; a positive
    sum indicates at least one non-text token.  Always returns False when
    the model has no multimodal prefix-cache restriction.
    """
    if not self.disable_prefix_mm:
        return False
    mm_inputs = task.get("multimodal_inputs", [])
    if not mm_inputs:
        return False
    type_ids = mm_inputs.get("token_type_ids", [])
    if not type_ids:
        return False
    # A non-zero token type marks multimodal (non-text) content —
    # presumably 0 = text, >0 = image/audio tokens; TODO confirm.
    return np.sum(type_ids) > 0

async def add_requests(self, task):
"""
Add a new request to the queue.
Expand All @@ -180,6 +191,15 @@ async def add_requests(self, task):
else:
self.data_processor.process_request_dict(task, self.max_model_len)

if self.enable_mm and self.enable_prefix_caching:
if self._check_mm_disable_prefix_cache(task):
api_server_logger.error(
f"Current model doesn't support multimodal data with prefix caching, {task}"
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The current service does not support processing requests containing multimodal data when prefix cache is enabled. Please send only text-based requests or disable prefix cache

Please change the error message to this one.

)
raise EngineError(
"Current model doesn't support multimodal data with prefix caching", error_code=400
)

task["prompt_token_ids_len"] = len(task["prompt_token_ids"])
input_ids_len = task["prompt_token_ids_len"]
task["max_tokens"] = min(self.max_model_len - input_ids_len, task.get("max_tokens"))
Expand Down
11 changes: 11 additions & 0 deletions fastdeploy/multimodal/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,20 @@ class MultimodalRegistry:
"Ernie5ForCausalLM",
}

# Multimodal architectures for which prefix caching must be rejected
# when a request actually carries multimodal (non-text) tokens.
mm_disable_prefix_cache_models: set[str] = {
    "Ernie5ForCausalLM",
}

@classmethod
def contains_model(cls, name: str) -> bool:
    """Return True when *name* is a registered multimodal architecture."""
    registered = cls.mm_models
    return name in registered

@classmethod
def contains_mm_disable_prefix_cache_model(cls, name: str) -> bool:
    """
    Check if the given name is a multimodal architecture that must
    disable prefix caching for requests containing multimodal data.
    """
    return name in cls.mm_disable_prefix_cache_models
2 changes: 1 addition & 1 deletion fastdeploy/output/token_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ def _process_batch_output_use_zmq(self, receive_datas):
if self.tokens_counter[task_id] == 0:
if task.messages is not None:
result.prompt = task.messages
result.num_cached_tokens = task.num_cached_tokens
result.num_cached_tokens = task.num_cached_tokens

is_prefill = task.disaggregate_info is not None and task.disaggregate_info["role"] == "prefill"
result = self._process_per_token(task, i, token_ids, result, is_prefill)
Expand Down
Loading