Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions benchmarks/yaml/x1-64k-w4a8c8-tp4.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
reasoning-parser: ernie_x1
tool_call_parser: ernie_x1
reasoning-parser: ernie-x1
tool_call_parser: ernie-x1
tensor_parallel_size: 4
max_model_len: 65536
max_num_seqs: 128
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/yaml/x1-a3b-128k-wint8-h800-tp1.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
tensor_parallel_size: 1
max_model_len: 131072
max_num_seqs: 32
reasoning_parser: ernie_x1
tool_call_parser: ernie_x1
reasoning_parser: ernie-x1
tool_call_parser: ernie-x1
load_choices: "default_v1"
quantization: wint8
4 changes: 2 additions & 2 deletions docs/best_practices/ERNIE-4.5-21B-A3B-Thinking.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ python -m fastdeploy.entrypoints.openai.api_server \
--tensor-parallel-size 1 \
--max-model-len 131072 \
--quantization wint8 \
--reasoning-parser ernie_x1 \
--tool-call-parser ernie_x1 \
--reasoning-parser ernie-x1 \
--tool-call-parser ernie-x1 \
--max-num-seqs 32
```
- `--quantization`: Specifies the quantization strategy used by the model. Different quantization strategies result in different model performance and accuracy. It can be one of `wint8` / `wint4` / `block_wise_fp8` (Hopper is required).
Expand Down
2 changes: 1 addition & 1 deletion docs/usage/environment_variables.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
# Whether to use Machete for wint4 dense GEMM.
"FD_USE_MACHETE": lambda: os.getenv("FD_USE_MACHETE", "1"),

# Used to truncate the string inserted during thinking when reasoning in a model. (</think> for ernie4_5_vl, \n</think>\n\n for ernie_x1)
# The string inserted to truncate the thinking content while a model is reasoning. (</think> for ernie-45-vl, \n</think>\n\n for ernie-x1)
"FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR": lambda: os.getenv("FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR", "</think>"),

# Timeout for cache_transfer_manager process exit
Expand Down
4 changes: 2 additions & 2 deletions docs/zh/best_practices/ERNIE-4.5-21B-A3B-Thinking.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ python -m fastdeploy.entrypoints.openai.api_server \
--tensor-parallel-size 1 \
--max-model-len 131072 \
--quantization wint8 \
--reasoning-parser ernie_x1 \
--tool-call-parser ernie_x1 \
--reasoning-parser ernie-x1 \
--tool-call-parser ernie-x1 \
--max-num-seqs 32
```
其中:
Expand Down
2 changes: 1 addition & 1 deletion docs/zh/usage/environment_variables.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
# 是否使用 Machete 后端的 wint4 GEMM.
"FD_USE_MACHETE": lambda: os.getenv("FD_USE_MACHETE", "1"),

# Used to truncate the string inserted during thinking when reasoning in a model. (</think> for ernie4_5_vl, \n</think>\n\n for ernie_x1)
# The string inserted to truncate the thinking content while a model is reasoning. (</think> for ernie-45-vl, \n</think>\n\n for ernie-x1)
"FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR": lambda: os.getenv("FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR", "</think>"),

# cache_transfer_manager 进程残留时退出等待超时时间
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ def get_tool_parser(cls, name) -> type:

Raise a KeyError exception if the name is not registered.
"""
name = name.replace("_", "-")
if name in cls.tool_parsers:
return cls.tool_parsers[name]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def random_tool_call_id() -> str:
from fastdeploy.utils import data_processor_logger


@ToolParserManager.register_module("ernie_45-vl-thinking")
@ToolParserManager.register_module("ernie-45-vl-thinking")
class Ernie45VLThinkingToolParser(ToolParser):
"""
Tool parser for Ernie model version 4.5.1.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def random_tool_call_id() -> str:
from fastdeploy.utils import data_processor_logger


@ToolParserManager.register_module("ernie_x1")
@ToolParserManager.register_module("ernie-x1")
class ErnieX1ToolParser(ToolParser):
"""
Tool parser for Ernie model version 4.5.1.
Expand Down
2 changes: 1 addition & 1 deletion fastdeploy/envs.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@
"FD_ENABLE_SWAP_SPACE_CLEARING": lambda: int(os.getenv("FD_ENABLE_SWAP_SPACE_CLEARING", "0")),
# enable return text, used when FD_ENABLE_INTERNAL_ADAPTER=1
"FD_ENABLE_RETURN_TEXT": lambda: bool(int(os.getenv("FD_ENABLE_RETURN_TEXT", "0"))),
# Used to truncate the string inserted during thinking when reasoning in a model. (</think> for ernie4_5_vl, \n</think>\n\n for ernie_x1)
# The string inserted to truncate the thinking content while a model is reasoning. (</think> for ernie-45-vl, \n</think>\n\n for ernie-x1)
"FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR": lambda: os.getenv("FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR", "</think>"),
# Timeout for cache_transfer_manager process exit
"FD_CACHE_PROC_EXIT_TIMEOUT": lambda: int(os.getenv("FD_CACHE_PROC_EXIT_TIMEOUT", "600")),
Expand Down
8 changes: 4 additions & 4 deletions fastdeploy/model_executor/pre_and_post_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def limit_thinking_content_length(
line_break_id: int = None,
):
if limit_strategy == "</think>":
# for ernie4_5_vl
# for ernie-45-vl
limit_thinking_content_length_v1(
sampled_token_ids,
max_think_lens,
Expand All @@ -110,7 +110,7 @@ def limit_thinking_content_length(
think_end_id,
)
elif limit_strategy == "\n</think>\n\n":
# for ernie_x1
# for ernie-x1
assert line_break_id > 0
limit_thinking_content_length_v2(
sampled_token_ids,
Expand All @@ -136,7 +136,7 @@ def speculate_limit_thinking_content_length(
line_break_id: int = None,
):
if limit_strategy == "</think>":
# for ernie4_5_vl
# for ernie-45-vl
speculate_limit_thinking_content_length_v1(
accept_tokens,
max_think_lens,
Expand All @@ -147,7 +147,7 @@ def speculate_limit_thinking_content_length(
think_end_id,
)
elif limit_strategy == "\n</think>\n\n":
# for ernie_x1
# for ernie-x1
assert line_break_id > 0
speculate_limit_thinking_content_length_v2(
accept_tokens,
Expand Down
1 change: 1 addition & 0 deletions fastdeploy/reasoning/abs_reasoning_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ def get_reasoning_parser(cls, name: Optional[str]) -> type[ReasoningParser]:

Raise a KeyError exception if the name is not registered.
"""
name = name.replace("_", "-")
if name in cls.reasoning_parsers:
return cls.reasoning_parsers[name]

Expand Down
4 changes: 2 additions & 2 deletions fastdeploy/reasoning/ernie_x1_reasoning_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@
from fastdeploy.reasoning import ReasoningParser, ReasoningParserManager


@ReasoningParserManager.register_module("ernie_x1")
@ReasoningParserManager.register_module("ernie-x1")
class ErnieX1ReasoningParser(ReasoningParser):
"""
Reasoning parser for ernie_x1 model with stricter boundary checking.
Reasoning parser for ernie-x1 model with stricter boundary checking.

Unified rules:
- Do not strip newline before </think>
Expand Down
4 changes: 2 additions & 2 deletions fastdeploy/worker/xpu_model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ def xpu_post_process(
step_idx = share_inputs["step_idx"]
limit_think_status = share_inputs["limit_think_status"]
if limit_strategy == "</think>":
# for ernie4_5_vl
# for ernie-45-vl
limit_thinking_content_length_v1(
sampled_token_ids,
max_think_lens,
Expand All @@ -212,7 +212,7 @@ def xpu_post_process(
think_end_id,
)
elif limit_strategy == "\n</think>\n\n":
# for ernie_x1
# for ernie-x1
assert line_break_id > 0
limit_thinking_content_length_v2(
sampled_token_ids,
Expand Down
8 changes: 4 additions & 4 deletions tests/entrypoints/openai/test_serving_completion.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,9 @@ def test_check_master_tp16_dp1_master(self):
self.assertTrue(serving_completion._check_master())

def test_calc_finish_reason_tool_calls(self):
# 创建一个模拟的engine_client,并设置reasoning_parser为"ernie_x1"
# 创建一个模拟的engine_client,并设置reasoning_parser为"ernie-x1"
engine_client = Mock()
engine_client.reasoning_parser = "ernie_x1"
engine_client.reasoning_parser = "ernie-x1"
# 创建一个OpenAIServingCompletion实例
serving_completion = OpenAIServingCompletion(engine_client, None, "pid", "ips", 360)
# 创建一个模拟的output,并设置finish_reason为"tool_call"
Expand All @@ -86,9 +86,9 @@ def test_calc_finish_reason_tool_calls(self):
assert result == "tool_calls"

def test_calc_finish_reason_stop(self):
# 创建一个模拟的engine_client,并设置reasoning_parser为"ernie_x1"
# 创建一个模拟的engine_client,并设置reasoning_parser为"ernie-x1"
engine_client = Mock()
engine_client.reasoning_parser = "ernie_x1"
engine_client.reasoning_parser = "ernie-x1"
# 创建一个OpenAIServingCompletion实例
serving_completion = OpenAIServingCompletion(engine_client, None, "pid", "ips", 360)
# 创建一个模拟的output,并设置finish_reason为其他值
Expand Down
Loading