2 changes: 1 addition & 1 deletion benchmarks/benchmark_mtp.py
@@ -98,7 +98,7 @@ def main(args):
raise ValueError("--max_concurrency should be same length as --s_itl_base_model")

for max_concurrency, s_itl in zip(args.max_concurrency, args.s_itl_base_model):
-# Wramup
+# Warmup
print("Starting warmup...")
with open(os.devnull, "w") as f:
with contextlib.redirect_stdout(f):
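
The hunk above touches the warmup loop, which silences benchmark output by redirecting stdout to the null device. As context only, here is a minimal sketch of that suppression pattern; the `run_benchmark` call in the usage comment is a hypothetical stand-in, not part of this diff.

```python
import contextlib
import os

def run_quietly(fn, *args, **kwargs):
    # Warmup runs are noisy; route their stdout to the null device.
    with open(os.devnull, "w") as f:
        with contextlib.redirect_stdout(f):
            return fn(*args, **kwargs)

# Hypothetical usage: warm the server up before measuring ITL.
# run_quietly(run_benchmark, max_concurrency=8)
```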
2 changes: 1 addition & 1 deletion custom_ops/gpu_ops/custom_all_reduce/all_reduce.cuh
@@ -303,7 +303,7 @@ class CustomAllreduce {
bool full_nvlink_;

RankSignals sg_;
-// Stores an map from a pointer to its peer pointters from all ranks.
+// Stores a map from a pointer to its peer pointers from all ranks.
std::unordered_map<void*, RankData*> buffers_;
Signal* self_sg_;

2 changes: 1 addition & 1 deletion docs/get_started/installation/nvidia_gpu.md
@@ -10,7 +10,7 @@ The following installation methods are available when your environment meets the

## 1. Pre-built Docker Installation (Recommended)

-**Notice**: The pre-built image only supports SM80/90 GPU(e.g. H800/A800),if you are deploying on SM86/89GPU(L40/4090/L20), please reinstall ```fastdpeloy-gpu``` after you create the container.
+**Notice**: The pre-built image only supports SM80/90 GPUs (e.g. H800/A800). If you are deploying on SM86/89 GPUs (L40/4090/L20), please reinstall ```fastdeploy-gpu``` after you create the container.

```shell
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-cuda-12.6:2.2.0
```
2 changes: 1 addition & 1 deletion docs/usage/code_overview.md
@@ -20,6 +20,6 @@ Below is an overview of the FastDeploy code structure and functionality organize
- ```platforms```: Platform-specific modules for underlying hardware support.
- ```scheduler```: Request scheduling module for large models.
- ```metrics```: Core component for collecting, managing, and exporting Prometheus metrics, tracking key runtime performance data (e.g., request latency, resource utilization, successful request counts).
-- ```splitwise```: Modules related to PD disaggragation deployment.
+- ```splitwise```: Modules related to PD disaggregation deployment.
- ```scripts```/```tools```: Utility scripts for FastDeploy operations (e.g., compilation, unit testing, code style fixes).
- ```test```: Code for unit testing and validation.
2 changes: 1 addition & 1 deletion docs/usage/log.md
@@ -30,7 +30,7 @@ By default, logs are stored in the `log` directory under the execution path. To
* `cache_transfer_manager.log` : Logs startup parameters and received request information.
* `launch_cache_manager.log` : Records cache transfer startup parameters and error messages.

-## PD Disaggragation Logs
+## PD Disaggregation Logs
* `cache_messager.log` : Logs transmission protocols and messages used by the P instance.
* `splitwise_connector.log` : Records data received from P/D instances and connection establishment details.

8 changes: 4 additions & 4 deletions mkdocs.yml
@@ -1,4 +1,4 @@
-site_name: 'FastDeploy : Large Language Model Deployement'
+site_name: 'FastDeploy : Large Language Model Deployment'
repo_url: https://github.com/PaddlePaddle/FastDeploy
repo_name: FastDeploy

@@ -36,7 +36,7 @@ plugins:
- locale: en
default: true
name: English
-site_name: 'FastDeploy: Large Language Model Deployement'
+site_name: 'FastDeploy: Large Language Model Deployment'
build: true
link: /FastDeploy/
- locale: zh
@@ -59,7 +59,7 @@ plugins:
ERNIE-4.5-VL-424B-A47B: ERNIE-4.5-VL-424B-A47B快速部署
Quick Deployment For QWEN: Qwen3-0.6b快速部署
Online Serving: 在线服务
-OpenAI-Compitable API Server: 兼容 OpenAI 协议的服务化部署
+OpenAI-Compatible API Server: 兼容 OpenAI 协议的服务化部署
Monitor Metrics: 监控Metrics
Scheduler: 调度器
Graceful Shutdown: 服务优雅关闭
@@ -114,7 +114,7 @@ nav:
- ERNIE-4.5-VL-424B-A47B: get_started/ernie-4.5-vl.md
- Quick Deployment For QWEN: get_started/quick_start_qwen.md
- Online Serving:
-- OpenAI-Compitable API Server: online_serving/README.md
+- OpenAI-Compatible API Server: online_serving/README.md
- Monitor Metrics: online_serving/metrics.md
- Scheduler: online_serving/scheduler.md
- Graceful Shutdown: online_serving/graceful_shutdown_service.md
50 changes: 17 additions & 33 deletions tests/ce/server/test_return_token_ids.py
@@ -9,12 +9,7 @@

import json

-from core import (
-    TEMPLATE,
-    URL,
-    build_request_payload,
-    send_request,
-)
+from core import TEMPLATE, URL, build_request_payload, send_request

COMPLETIONS_URL = URL.replace("/v1/chat/completions", "/v1/completions")

@@ -29,17 +24,17 @@ def test_completion_stream_text_after_process_raw_prediction():
"stream": True,
"stream_options": {"include_usage": True, "continuous_usage_stats": True},
"max_tokens": 50,
-"return_token_ids": True
+"return_token_ids": True,
}

payload = build_request_payload(TEMPLATE, data)
resp = send_request(COMPLETIONS_URL, payload, stream=True)
for line in resp.iter_lines(decode_unicode=True):
if line.strip() == "data: [DONE]":
break
if line.strip() == "" or not line.startswith("data: "):
continue
-line = line[len("data: "):]
+line = line[len("data: ") :]
response_data = json.loads(line)

choice = response_data["choices"][0]
@@ -51,21 +46,16 @@ def test_completion_stream_text_after_process_raw_prediction():
reasoning_content = choice["reasoning_content"]
text = choice["text"]
assert reasoning_content or text in raw_prediction, "raw_prediction value is incorrect"
-if "finish_reason" in line.strip() :
+if "finish_reason" in line.strip():
break

-def test_completion_text_after_process_raw_predictio_return_tokrn_ids():
+
+def test_completion_text_after_process_raw_predictio_return_token_ids():
"""
/v1/completions endpoint, non-streaming.
Returns the "text_after_process" and "reasoning_content" attributes.
"""
-data = {
-    "stream": False,
-    "prompt": "你是谁",
-    "max_tokens": 50,
-    "return_token_ids": True
-}
+data = {"stream": False, "prompt": "你是谁", "max_tokens": 50, "return_token_ids": True}
payload = build_request_payload(TEMPLATE, data)
resp = send_request(COMPLETIONS_URL, payload).json()

@@ -80,14 +70,10 @@ def test_completion_text_after_process_raw_predictio_return_tokrn_ids():

def test_completion_text_after_process_raw_prediction():
"""
-/v1/completions endpoint, without the return_tokrn_ids parameter
+/v1/completions endpoint, without the return_token_ids parameter
In the non-streaming endpoint, without return token ids the "text_after_process" and "reasoning_content" attributes are null.
"""
-data = {
-    "stream": False,
-    "prompt": "你是谁",
-    "max_tokens": 50
-}
+data = {"stream": False, "prompt": "你是谁", "max_tokens": 50}
payload = build_request_payload(TEMPLATE, data)
resp = send_request(COMPLETIONS_URL, payload).json()

@@ -108,17 +94,17 @@ def test_stream_text_after_process_raw_prediction():
"stream": True,
"stream_options": {"include_usage": True, "continuous_usage_stats": True},
"max_tokens": 50,
-"return_token_ids": True
+"return_token_ids": True,
}

payload = build_request_payload(TEMPLATE, data)
resp = send_request(URL, payload, stream=True)
for line in resp.iter_lines(decode_unicode=True):
-if line.strip() == "data: [DONE]" :
+if line.strip() == "data: [DONE]":
break
if line.strip() == "" or not line.startswith("data: "):
continue
-line = line[len("data: "):]
+line = line[len("data: ") :]
response_data = json.loads(line)

choice = response_data["choices"][0]
@@ -130,11 +116,11 @@ def test_stream_text_after_process_raw_prediction():
reasoning_content = choice["delta"]["reasoning_content"]
content = choice["delta"]["content"]
assert reasoning_content or content in raw_prediction, "raw_prediction value is incorrect"
-if "finish_reason" in line.strip() :
+if "finish_reason" in line.strip():
break

-def test_text_after_process_raw_prediction_return_tokrn_ids():
+
+def test_text_after_process_raw_prediction_return_token_ids():
"""
/v1/chat/completions endpoint, non-streaming.
Returns the "text_after_process" and "reasoning_content" attributes.
@@ -161,7 +147,7 @@ def test_text_after_process_raw_prediction_return_tokrn_ids():

def test_text_after_process_raw_prediction():
"""
-/v1/chat/completions endpoint, without the return_tokrn_ids parameter
+/v1/chat/completions endpoint, without the return_token_ids parameter
Without return token ids, the "text_after_process" and "reasoning_content" attributes are null.
"""
data = {
@@ -179,5 +165,3 @@ def test_text_after_process_raw_prediction():

raw_prediction = resp["choices"][0]["message"]["raw_prediction"]
assert raw_prediction is None, "raw_prediction value is incorrect"
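
All of the streaming tests in this file share one parsing pattern: read server-sent-event lines, strip the `data: ` prefix, and stop at the `[DONE]` sentinel. A condensed sketch of that loop for reference, assuming `resp` is a `requests` streaming response as in the tests:

```python
import json

def iter_sse_chunks(resp):
    # Yield decoded JSON chunks from an OpenAI-style SSE stream.
    for line in resp.iter_lines(decode_unicode=True):
        line = line.strip()
        if not line.startswith("data: "):
            continue  # skip blank keep-alive lines
        payload = line[len("data: ") :]
        if payload == "[DONE]":  # server's end-of-stream sentinel
            break
        yield json.loads(payload)
```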


4 changes: 2 additions & 2 deletions tests/ci_use/EB_Lite_with_adapter/zmq_client.py
@@ -50,7 +50,7 @@ def consume_results(self, result_queue):
if self.need_exit:
break
except Exception as e:
-print(f"zmq client occured error {e} type: {type(e)} frames: {frames}")
+print(f"zmq client occurred error {e} type: {type(e)} frames: {frames}")

def start(self, result_queue):
threading.Thread(target=self.consume_results, args=(result_queue,), daemon=True).start()
@@ -118,4 +118,4 @@ def recv_results(self):
self.result[task_id] = result["result"]
self.task_event[task_id].set()
except Exception as e:
-print(f"zmq client occured error {e} type: {type(e)} frames: {frames}")
+print(f"zmq client occurred error {e} type: {type(e)} frames: {frames}")
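
Both corrected lines sit in the client's result-consumption path: a daemon thread drains a queue and wakes the waiter registered for each task id. A minimal sketch of that pattern, with illustrative names that are assumptions rather than this file's actual API:

```python
import threading
from queue import Queue

class ResultWaiter:
    """Sketch: a daemon thread drains results and wakes per-task waiters."""

    def __init__(self):
        self.result = {}
        self.task_event = {}

    def start(self, result_queue: Queue) -> None:
        threading.Thread(target=self._consume, args=(result_queue,), daemon=True).start()

    def _consume(self, result_queue: Queue) -> None:
        while True:
            task_id, payload = result_queue.get()
            self.result[task_id] = payload
            # Wake whoever is blocked in wait_for() on this task.
            self.task_event.setdefault(task_id, threading.Event()).set()

    def wait_for(self, task_id: str, timeout: float = 30.0):
        event = self.task_event.setdefault(task_id, threading.Event())
        if event.wait(timeout):
            return self.result.pop(task_id)
        raise TimeoutError(f"no result for task {task_id}")
```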
10 changes: 5 additions & 5 deletions tests/distributed/custom_all_reduce.py
@@ -54,14 +54,14 @@ def test_case(self):
fa = CustomAllreduce(model_parallel_group)

for m, n in mns:
-data_cusom_ar = paddle.rand([m, n], dtype="bfloat16")
-data_paddle = data_cusom_ar.clone()
-if fa.should_custom_ar(data_cusom_ar):
-    fa.custom_all_reduce(data_cusom_ar)
+data_custom_ar = paddle.rand([m, n], dtype="bfloat16")
+data_paddle = data_custom_ar.clone()
+if fa.should_custom_ar(data_custom_ar):
+    fa.custom_all_reduce(data_custom_ar)
dist.all_reduce(data_paddle)
if dist.get_rank() == 0:
np.testing.assert_allclose(
-data_cusom_ar.numpy(),
+data_custom_ar.numpy(),
data_paddle.numpy(),
rtol=1e-04,
atol=1e-04,
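
The renamed tensor feeds a correctness check: the test runs the custom all-reduce and Paddle's built-in `dist.all_reduce` on cloned inputs, then compares the results on rank 0. Reduced to plain NumPy (float32 standing in for bfloat16, tolerances copied from the hunk), the check amounts to this sketch:

```python
import numpy as np

# Simulate an all-reduce (element-wise sum) across 4 ranks two ways and compare,
# mirroring the custom-vs-paddle assertion in the test above.
rank_tensors = [np.random.rand(128, 1024).astype(np.float32) for _ in range(4)]

custom_result = rank_tensors[0].copy()
for t in rank_tensors[1:]:
    custom_result += t                      # stand-in for fa.custom_all_reduce

reference = np.sum(rank_tensors, axis=0)    # stand-in for dist.all_reduce

np.testing.assert_allclose(custom_result, reference, rtol=1e-04, atol=1e-04)
```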
2 changes: 1 addition & 1 deletion tools/deep_gemm_pre-compile/pre_compile.py
@@ -158,7 +158,7 @@ def pre_compile_from_config(config_file: str, num_threads: int, expert_parallel:

pbar.close()

-logger.info(f"Total compliation time: {time() - start_time:.2f} seconds")
+logger.info(f"Total compilation time: {time() - start_time:.2f} seconds")


def main(args):