Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion fastdeploy/collect_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -561,7 +561,7 @@ def get_env_info():

if PADDLE_AVAILABLE:
paddle_version_str = paddle.__version__
paddle_cuda_available_str = str(torch.cuda.is_available())
paddle_cuda_available_str = str(paddle.device.is_compiled_with_cuda())
paddle_cuda_version_str = str(paddle.version.cuda())
else:
version_str = paddle_cuda_available_str = cuda_version_str = "N/A"
Expand Down
1 change: 1 addition & 0 deletions fastdeploy/engine/common_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -1060,6 +1060,7 @@ def _exit_sub_services(self):
"""
exit sub services
"""
llm_logger.info("Exit sub services.....")
self.running = False
if hasattr(self, "engine_worker_queue_server") and self.engine_worker_queue_server is not None:
self.engine_worker_queue_server.cleanup()
Expand Down
3 changes: 3 additions & 0 deletions fastdeploy/entrypoints/openai/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ class CompletionTokenUsageInfo(BaseModel):
"""

reasoning_tokens: Optional[int] = None
image_tokens: Optional[int] = None


class PromptTokenUsageInfo(BaseModel):
Expand All @@ -74,6 +75,8 @@ class PromptTokenUsageInfo(BaseModel):
"""

cached_tokens: Optional[int] = None
image_tokens: Optional[int] = None
video_tokens: Optional[int] = None


class UsageInfo(BaseModel):
Expand Down
5 changes: 5 additions & 0 deletions fastdeploy/entrypoints/openai/response_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

from typing import Any, List, Optional

from fastdeploy.entrypoints.openai.usage_calculator import count_tokens
from fastdeploy.input.tokenzier_client import AsyncTokenizerClient, ImageDecodeRequest
from fastdeploy.utils import api_server_logger

Expand Down Expand Up @@ -104,6 +105,7 @@ async def process_response_chat(self, request_outputs, stream, enable_thinking,
image_output = self._end_image_code_request_output
image_output["outputs"]["multipart"] = [image]
image_output["outputs"]["token_ids"] = all_tokens
image_output["outputs"]["num_image_tokens"] = count_tokens(all_tokens)
yield image_output

self.data_processor.process_response_dict(
Expand All @@ -124,6 +126,7 @@ async def process_response_chat(self, request_outputs, stream, enable_thinking,
token_ids = request_output["outputs"]["token_ids"]
if token_ids[-1] == self.eos_token_id:
multipart = []
num_image_tokens = 0
for part in self._multipart_buffer:
if part["decode_type"] == 0:
self.data_processor.process_response_dict(
Expand All @@ -139,6 +142,7 @@ async def process_response_chat(self, request_outputs, stream, enable_thinking,
if self.decoder_client:
req_id = part["request_output"]["request_id"]
all_tokens = part["request_output"]["outputs"]["token_ids"]
num_image_tokens += count_tokens(all_tokens)

image_ret = await self.decoder_client.decode_image(
request=ImageDecodeRequest(req_id=req_id, data=all_tokens)
Expand All @@ -150,4 +154,5 @@ async def process_response_chat(self, request_outputs, stream, enable_thinking,

lasrt_request_output = self._multipart_buffer[-1]["request_output"]
lasrt_request_output["outputs"]["multipart"] = multipart
lasrt_request_output["outputs"]["num_image_tokens"] = num_image_tokens
yield lasrt_request_output
25 changes: 22 additions & 3 deletions fastdeploy/entrypoints/openai/serving_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,8 @@ async def chat_completion_stream_generator(
previous_num_tokens = [0] * num_choices
reasoning_num_tokens = [0] * num_choices
num_prompt_tokens = 0
num_cached_tokens = 0
num_image_tokens = [0] * num_choices
tool_called = [False] * num_choices
max_streaming_response_tokens = (
request.max_streaming_response_tokens
Expand Down Expand Up @@ -321,6 +323,9 @@ async def chat_completion_stream_generator(
output_top_logprobs = output["top_logprobs"]
output_draft_top_logprobs = output["draft_top_logprobs"]
previous_num_tokens[idx] += len(output["token_ids"])
if output.get("num_image_tokens"):
previous_num_tokens[idx] += output.get("num_image_tokens")
num_image_tokens[idx] += output.get("num_image_tokens")
reasoning_num_tokens[idx] += output.get("reasoning_token_num", 0)
logprobs_res: Optional[LogProbs] = None
draft_logprobs_res: Optional[LogProbs] = None
Expand Down Expand Up @@ -389,8 +394,10 @@ async def chat_completion_stream_generator(
prompt_tokens=num_prompt_tokens,
completion_tokens=previous_num_tokens[idx],
total_tokens=num_prompt_tokens + previous_num_tokens[idx],
prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=num_cached_tokens),
completion_tokens_details=CompletionTokenUsageInfo(
reasoning_tokens=reasoning_num_tokens[idx]
reasoning_tokens=reasoning_num_tokens[idx],
image_tokens=num_image_tokens[idx],
),
)
choices.append(choice)
Expand All @@ -409,7 +416,10 @@ async def chat_completion_stream_generator(
prompt_tokens=num_prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=num_prompt_tokens + completion_tokens,
completion_tokens_details=CompletionTokenUsageInfo(reasoning_tokens=reasoning_tokens),
prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=num_cached_tokens),
completion_tokens_details=CompletionTokenUsageInfo(
image_tokens=sum(num_image_tokens), reasoning_tokens=reasoning_tokens
),
)
chunk = ChatCompletionStreamResponse(
id=request_id,
Expand Down Expand Up @@ -466,6 +476,7 @@ async def chat_completion_full_generator(
draft_logprob_contents = [[] for _ in range(num_choices)]
completion_token_ids = [[] for _ in range(num_choices)]
num_cached_tokens = [0] * num_choices
num_image_tokens = [0] * num_choices
response_processor = ChatResponseProcessor(
data_processor=self.engine_client.data_processor,
enable_mm_output=self.enable_mm_output,
Expand Down Expand Up @@ -531,6 +542,9 @@ async def chat_completion_full_generator(
if data["finished"]:
num_choices -= 1
reasoning_num_tokens[idx] = data["outputs"].get("reasoning_token_num", 0)
if data["outputs"].get("image_token_num"):
previous_num_tokens[idx] += data["outputs"].get("image_token_num")
num_image_tokens[idx] = data["outputs"].get("image_token_num")
choice = await self._create_chat_completion_choice(
output=output,
index=idx,
Expand All @@ -540,6 +554,7 @@ async def chat_completion_full_generator(
prompt_tokens=prompt_tokens,
completion_token_ids=completion_token_ids[idx],
num_cached_tokens=num_cached_tokens,
num_image_tokens=num_image_tokens,
logprob_contents=logprob_contents,
response_processor=response_processor,
)
Expand All @@ -557,7 +572,9 @@ async def chat_completion_full_generator(
completion_tokens=num_generated_tokens,
total_tokens=num_prompt_tokens + num_generated_tokens,
prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=sum(num_cached_tokens)),
completion_tokens_details=CompletionTokenUsageInfo(reasoning_tokens=num_reasoning_tokens),
completion_tokens_details=CompletionTokenUsageInfo(
reasoning_tokens=num_reasoning_tokens, image_tokens=sum(num_image_tokens)
),
)
choices = sorted(choices, key=lambda x: x.index)
res = ChatCompletionResponse(
Expand All @@ -580,6 +597,7 @@ async def _create_chat_completion_choice(
prompt_tokens: str,
completion_token_ids: list,
num_cached_tokens: list,
num_image_tokens: list,
logprob_contents: list,
response_processor: ChatResponseProcessor,
) -> ChatCompletionResponseChoice:
Expand Down Expand Up @@ -609,6 +627,7 @@ async def _create_chat_completion_choice(
has_no_token_limit = request.max_tokens is None and request.max_completion_tokens is None
max_tokens = request.max_completion_tokens or request.max_tokens
num_cached_tokens[index] = output.get("num_cached_tokens", 0)
num_image_tokens[index] = output.get("num_image_tokens", 0)

finish_reason = "stop"
if has_no_token_limit or previous_num_tokens != max_tokens:
Expand Down
23 changes: 20 additions & 3 deletions fastdeploy/entrypoints/openai/serving_completion.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
CompletionTokenUsageInfo,
ErrorInfo,
ErrorResponse,
PromptTokenUsageInfo,
UsageInfo,
)
from fastdeploy.utils import (
Expand Down Expand Up @@ -370,6 +371,8 @@ async def completion_stream_generator(
req_id = f"{request_id}_{i}"
dealer.write([b"", req_id.encode("utf-8")]) # 发送多路请求
output_tokens = [0] * num_choices
num_cache_tokens = [0] * num_choices
num_image_tokens = [0] * num_choices
inference_start_time = [0] * num_choices
reasoning_tokens = [0] * num_choices
first_iteration = [True] * num_choices
Expand Down Expand Up @@ -459,7 +462,11 @@ async def completion_stream_generator(
draft_logprobs_res = self._create_completion_logprobs(
output_draft_top_logprobs, request.logprobs, 0
)
output_tokens[idx] += 1
output_tokens[idx] += len(output.get("token_ids", [])) or 0
num_cache_tokens[idx] += output.get("num_cache_tokens") or 0
if output.get("num_image_tokens"):
output_tokens[idx] += output.get("num_image_tokens")
num_image_tokens[idx] += output.get("num_image_tokens")
reasoning_tokens[idx] += output.get("reasoning_token_num", 0)
delta_message = CompletionResponseStreamChoice(
index=idx,
Expand Down Expand Up @@ -527,8 +534,9 @@ async def completion_stream_generator(
prompt_batched_token_ids[idx // (1 if request.n is None else request.n)]
)
+ output_tokens[idx],
prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=num_cache_tokens[idx]),
completion_tokens_details=CompletionTokenUsageInfo(
reasoning_tokens=reasoning_tokens[idx]
image_tokens=num_image_tokens[idx], reasoning_tokens=reasoning_tokens[idx]
),
),
)
Expand Down Expand Up @@ -559,6 +567,8 @@ def request_output_to_completion_response(
choices: List[CompletionResponseChoice] = []
num_prompt_tokens = 0
num_generated_tokens = 0
num_cache_tokens = 0
num_image_tokens = 0
num_reasoning_tokens = 0

for idx in range(len(final_res_batch)):
Expand Down Expand Up @@ -614,6 +624,10 @@ def request_output_to_completion_response(
num_generated_tokens += final_res["output_token_ids"]

num_prompt_tokens += len(prompt_token_ids)
num_cache_tokens += output.get("num_cache_tokens") or 0
if output.get("num_image_tokens"):
num_generated_tokens += output.get("num_image_tokens")
num_image_tokens += output.get("num_image_tokens")

num_reasoning_tokens += output.get("reasoning_token_num", 0)

Expand All @@ -622,7 +636,10 @@ def request_output_to_completion_response(
prompt_tokens=num_prompt_tokens,
completion_tokens=num_generated_tokens,
total_tokens=num_prompt_tokens + num_generated_tokens,
completion_tokens_details=CompletionTokenUsageInfo(reasoning_tokens=num_reasoning_tokens),
prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=num_cache_tokens),
completion_tokens_details=CompletionTokenUsageInfo(
reasoning_tokens=num_reasoning_tokens, image_tokens=num_image_tokens
),
)
del request

Expand Down
33 changes: 33 additions & 0 deletions fastdeploy/entrypoints/openai/usage_calculator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

import numpy as np


def count_tokens(tokens):
    """
    Count the number of scalar tokens in a nested list/array structure.

    Args:
        tokens: A single token id, or an arbitrarily nested structure of
            lists, tuples, and numpy arrays containing token ids.

    Returns:
        int: Total number of leaf (non-container) elements found.
    """
    count = 0
    stack = [tokens]
    while stack:
        current = stack.pop()
        if isinstance(current, (list, tuple, np.ndarray)):
            # Order is irrelevant when counting, so children are pushed
            # as-is; the original reversed() only preserved traversal
            # order and added needless work.
            stack.extend(current)
        else:
            count += 1
    return count
20 changes: 6 additions & 14 deletions tests/ce/server/test_logprobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,9 @@ def test_unstream_with_logprobs():
"bytes": [231, 137, 155, 233, 161, 191],
"top_logprobs": None,
}
assert resp_json["usage"] == {
"prompt_tokens": 22,
"total_tokens": 25,
"completion_tokens": 3,
"prompt_tokens_details": {"cached_tokens": 0},
"completion_tokens_details": {"reasoning_tokens": 0},
}
assert resp_json["usage"]["prompt_tokens"] == 22
assert resp_json["usage"]["completion_tokens"] == 3
assert resp_json["usage"]["total_tokens"] == 25


def test_unstream_without_logprobs():
Expand All @@ -65,13 +61,9 @@ def test_unstream_without_logprobs():
# 校验返回内容与 logprobs 字段
assert resp_json["choices"][0]["message"]["content"] == "牛顿的"
assert resp_json["choices"][0]["logprobs"] is None
assert resp_json["usage"] == {
"prompt_tokens": 22,
"total_tokens": 25,
"completion_tokens": 3,
"prompt_tokens_details": {"cached_tokens": 0},
"completion_tokens_details": {"reasoning_tokens": 0},
}
assert resp_json["usage"]["prompt_tokens"] == 22
assert resp_json["usage"]["completion_tokens"] == 3
assert resp_json["usage"]["total_tokens"] == 25


def test_stream_with_logprobs():
Expand Down
7 changes: 7 additions & 0 deletions tests/entrypoints/openai/test_max_streaming_tokens.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,7 @@ async def test_create_chat_completion_choice(self):
"reasoning_content": "Normal reasoning",
"tool_call": None,
"num_cached_tokens": 3,
"num_image_tokens": 2,
"raw_prediction": "raw_answer_0",
},
"finished": True,
Expand All @@ -403,6 +404,7 @@ async def test_create_chat_completion_choice(self):
"tool_calls": None,
"raw_prediction": "raw_answer_0",
"num_cached_tokens": 3,
"num_image_tokens": 2,
"finish_reason": "stop",
},
},
Expand All @@ -415,6 +417,7 @@ async def test_create_chat_completion_choice(self):
"reasoning_content": None,
"tool_call": None,
"num_cached_tokens": 0,
"num_image_tokens": 0,
"raw_prediction": None,
},
"finished": True,
Expand All @@ -430,6 +433,7 @@ async def test_create_chat_completion_choice(self):
"tool_calls": None,
"raw_prediction": None,
"num_cached_tokens": 0,
"num_image_tokens": 0,
"finish_reason": "stop",
},
},
Expand All @@ -442,6 +446,7 @@ async def test_create_chat_completion_choice(self):
mock_response_processor.enable_multimodal_content.return_value = False
completion_token_ids = [[], []]
num_cached_tokens = [0, 0]
num_image_tokens = [0, 0]

for idx, case in enumerate(test_cases):
actual_choice = await self.chat_serving._create_chat_completion_choice(
Expand All @@ -453,6 +458,7 @@ async def test_create_chat_completion_choice(self):
prompt_tokens=prompt_tokens,
completion_token_ids=completion_token_ids[idx],
num_cached_tokens=num_cached_tokens,
num_image_tokens=num_image_tokens,
logprob_contents=logprob_contents,
response_processor=mock_response_processor,
)
Expand All @@ -468,6 +474,7 @@ async def test_create_chat_completion_choice(self):
self.assertEqual(actual_choice.message.completion_token_ids, completion_token_ids[idx])

self.assertEqual(num_cached_tokens[expected["index"]], expected["num_cached_tokens"])
self.assertEqual(num_image_tokens[expected["index"]], expected["num_image_tokens"])
self.assertEqual(actual_choice.finish_reason, expected["finish_reason"])
assert actual_choice.logprobs is not None

Expand Down
Loading
Loading