Skip to content
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
4bc170c
[Feature] support prefix cache in DP
Oct 10, 2025
1851e80
fix
Oct 10, 2025
983a8a8
Merge branch 'develop' into update_ep
ltd0924 Oct 10, 2025
2a9e046
Update common_engine.py
ltd0924 Oct 10, 2025
58fced4
Merge branch 'develop' into update_ep
ltd0924 Oct 10, 2025
f5733dd
Update common_engine.py
ltd0924 Oct 10, 2025
427cf47
Update common_engine.py
ltd0924 Oct 10, 2025
00bdcc2
Update common_engine.py
ltd0924 Oct 10, 2025
0822d63
[BugFix] fix workers more than 1
Oct 11, 2025
85b6990
Merge branch 'develop' into update_ep
ltd0924 Oct 11, 2025
0acf059
fix
Oct 11, 2025
667d146
Update api_server.py
ltd0924 Oct 12, 2025
a03dfe6
fix
Oct 12, 2025
141abd7
Update api_server.py
ltd0924 Oct 13, 2025
1f07ecd
fix
Oct 13, 2025
a531165
Merge branch 'develop' into update_ep
ltd0924 Oct 13, 2025
3670530
Merge branch 'develop' into update_ep
ltd0924 Oct 14, 2025
ab6f741
Merge branch 'develop' into update_ep
ltd0924 Oct 15, 2025
90cd313
[Feature] Support disabling prefix cache for multimodal models
Oct 16, 2025
e60d098
Merge branch 'develop' into update_ep
ltd0924 Oct 16, 2025
0053f17
Update api_server.py
ltd0924 Oct 16, 2025
03d9f22
Update engine_client.py
ltd0924 Oct 20, 2025
9ade15e
Update engine_client.py
ltd0924 Oct 20, 2025
7a93f0c
add test
Oct 20, 2025
a38a272
Merge branch 'develop' into update_ep
ltd0924 Oct 20, 2025
842cde7
Update test_chat.py
ltd0924 Oct 20, 2025
bd4ec3c
fix
Oct 20, 2025
b3112ba
Merge branch 'develop' into update_ep
ltd0924 Oct 20, 2025
33d9093
fix
Oct 20, 2025
334d29a
Update test_chat.py
ltd0924 Oct 20, 2025
6957bdc
Update test_chat.py
ltd0924 Oct 20, 2025
2cdcc02
Merge branch 'develop' into update_ep
Jiang-Jia-Jun Oct 21, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions fastdeploy/entrypoints/engine_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ def __init__(
architectures = ModelConfig({"model": model_name_or_path}).architectures[0]
if MultimodalRegistry.contains_model(architectures):
self.enable_mm = True
self.disable_prefix_mm = MultimodalRegistry.contains_mm_disable_prefix_cache_model(architectures)
else:
self.enable_mm = False

Expand Down Expand Up @@ -158,6 +159,16 @@ async def format_and_add_data(self, prompts: dict):
await self.add_requests(prompts)
return prompts["prompt_token_ids"]

def _check_mm_disable_prefix_cache(self, task):
    """Return True when *task* contains multimodal tokens but the model
    disallows prefix caching for multimodal data.

    Looks at ``task["multimodal_inputs"]["token_type_ids"]``; a positive
    sum indicates at least one non-text token.  Always returns False when
    the model has no multimodal prefix-cache restriction.
    """
    if not self.disable_prefix_mm:
        return False
    mm_inputs = task.get("multimodal_inputs", [])
    if not mm_inputs:
        return False
    type_ids = mm_inputs.get("token_type_ids", [])
    if not type_ids:
        return False
    # A non-zero token type marks multimodal (non-text) content —
    # presumably 0 = text, >0 = image/audio tokens; TODO confirm.
    return np.sum(type_ids) > 0

async def add_requests(self, task):
"""
Add a new request to the queue.
Expand All @@ -180,6 +191,15 @@ async def add_requests(self, task):
else:
self.data_processor.process_request_dict(task, self.max_model_len)

if self.enable_mm and self.enable_prefix_caching:
if self._check_mm_disable_prefix_cache(task):
api_server_logger.error(
f"Current model doesn't support multimodal data with prefix caching, {task}"
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The current service does not support processing requests containing multimodal data when prefix cache is enabled. Please send only text-based requests or disable prefix cache

Please change the error message to this one.

)
raise EngineError(
"Current model doesn't support multimodal data with prefix caching", error_code=400
)

task["prompt_token_ids_len"] = len(task["prompt_token_ids"])
input_ids_len = task["prompt_token_ids_len"]
task["max_tokens"] = min(self.max_model_len - input_ids_len, task.get("max_tokens"))
Expand Down
11 changes: 11 additions & 0 deletions fastdeploy/multimodal/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,20 @@ class MultimodalRegistry:
"Ernie5ForCausalLM",
}

# Multimodal architectures for which prefix caching must be rejected
# when a request actually carries multimodal (non-text) tokens.
mm_disable_prefix_cache_models: set[str] = {
    "Ernie5ForCausalLM",
}

@classmethod
def contains_model(cls, name: str) -> bool:
    """Return True when *name* is a registered multimodal architecture."""
    registered = cls.mm_models
    return name in registered

@classmethod
def contains_mm_disable_prefix_cache_model(cls, name: str) -> bool:
    """
    Check if the given name is a multimodal architecture that must
    disable prefix caching for requests containing multimodal data.
    """
    return name in cls.mm_disable_prefix_cache_models
2 changes: 1 addition & 1 deletion fastdeploy/output/token_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ def _process_batch_output_use_zmq(self, receive_datas):
if self.tokens_counter[task_id] == 0:
if task.messages is not None:
result.prompt = task.messages
result.num_cached_tokens = task.num_cached_tokens
result.num_cached_tokens = task.num_cached_tokens

is_prefill = task.disaggregate_info is not None and task.disaggregate_info["role"] == "prefill"
result = self._process_per_token(task, i, token_ids, result, is_prefill)
Expand Down
Loading