Skip to content

Commit 4d1f48e

Browse files
authored
Merge branch 'develop' into ut
2 parents 91d198c + 3cbca75 commit 4d1f48e

File tree

94 files changed

+7609
-2788
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

94 files changed

+7609
-2788
lines changed

benchmarks/backend_request_func.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ class RequestFuncInput:
5151
ignore_eos: bool = False
5252
language: Optional[str] = None
5353
debug: bool = False
54+
response_format: Optional[dict] = None
5455

5556

5657
@dataclass
@@ -93,8 +94,11 @@ async def async_request_eb_openai_chat_completions(
9394
"stream_options": {
9495
"include_usage": True,
9596
"continuous_usage_stats": True,
96-
},
97+
}
9798
}
99+
if request_func_input.response_format:
100+
payload["response_format"] = request_func_input.response_format
101+
98102
# 超参由yaml传入
99103
payload.update(request_func_input.hyper_parameters)
100104

benchmarks/benchmark_dataset.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,8 @@ class SampleRequest:
4545
json_data: Optional[dict]
4646
prompt_len: int
4747
expected_output_len: int
48-
48+
response_format: Optional[dict] = None
49+
4950

5051
class BenchmarkDataset(ABC):
5152
"""BenchmarkDataset"""
@@ -297,6 +298,7 @@ def sample(
297298
json_data = entry
298299
prompt = entry["messages"][-1].get("content", "")
299300
history_QA = entry.get("messages", [])
301+
response_format = entry.get("response_format")
300302
new_output_len = int(entry.get("max_tokens", 12288))
301303

302304
if enable_multimodal_chat:
@@ -309,6 +311,7 @@ def sample(
309311
prompt_len=0,
310312
history_QA=history_QA,
311313
expected_output_len=new_output_len,
314+
response_format=response_format
312315
)
313316
)
314317
cnt += 1

benchmarks/benchmark_serving.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -336,6 +336,7 @@ async def benchmark(
336336
input_requests[0].no,
337337
)
338338
test_history_QA = input_requests[0].history_QA
339+
response_format = input_requests[0].response_format
339340

340341
test_input = RequestFuncInput(
341342
model=model_id,
@@ -351,6 +352,7 @@ async def benchmark(
351352
ignore_eos=ignore_eos,
352353
debug=debug,
353354
extra_body=extra_body,
355+
response_format=response_format
354356
)
355357

356358
print("test_input:", test_input)
@@ -382,6 +384,7 @@ async def benchmark(
382384
logprobs=logprobs,
383385
ignore_eos=ignore_eos,
384386
extra_body=extra_body,
387+
response_format=response_format
385388
)
386389
profile_output = await request_func(request_func_input=profile_input)
387390
if profile_output.success:
@@ -420,6 +423,7 @@ async def limited_request_func(request_func_input, pbar):
420423
request.no,
421424
)
422425
history_QA = request.history_QA
426+
response_format = request.response_format
423427

424428
req_model_id, req_model_name = model_id, model_name
425429
if lora_modules:
@@ -440,6 +444,7 @@ async def limited_request_func(request_func_input, pbar):
440444
debug=debug,
441445
ignore_eos=ignore_eos,
442446
extra_body=extra_body,
447+
response_format=response_format
443448
)
444449
tasks.append(asyncio.create_task(limited_request_func(request_func_input=request_func_input, pbar=pbar)))
445450
outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
@@ -455,6 +460,7 @@ async def limited_request_func(request_func_input, pbar):
455460
api_url=base_url + "/stop_profile",
456461
output_len=test_output_len,
457462
logprobs=logprobs,
463+
response_format=response_format
458464
)
459465
profile_output = await request_func(request_func_input=profile_input)
460466
if profile_output.success:
@@ -982,7 +988,7 @@ def main(args: argparse.Namespace):
982988
if args.result_dir:
983989
file_name = os.path.join(args.result_dir, file_name)
984990
with open(file_name, "w", encoding="utf-8") as outfile:
985-
json.dump(result_json, outfile)
991+
json.dump(result_json, outfile, ensure_ascii=False)
986992
save_to_pytorch_benchmark_format(args, result_json, file_name)
987993

988994

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
max_model_len: 32768
2+
max_num_seqs: 128
3+
gpu_memory_utilization: 0.85
4+
tensor_parallel_size: 1
5+
limit_mm_per_prompt: '{"image": 100, "video": 100}'
6+
enable_mm: True
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
top_p: 0.8
2+
temperature: 0.7
3+
metadata:
4+
min_tokens: 1
5+
max_tokens: 32768
6+
repetition_penalty: 1.05
7+
frequency_penalty: 0
8+
presence_penalty: 0

benchmarks/yaml/x1-64k-w4a8c8-tp4.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
reasoning-parser: ernie_x1
2-
tool_call_parser: ernie_x1
1+
reasoning-parser: ernie-x1
2+
tool_call_parser: ernie-x1
33
tensor_parallel_size: 4
44
max_model_len: 65536
55
max_num_seqs: 128
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
tensor_parallel_size: 1
22
max_model_len: 131072
33
max_num_seqs: 32
4-
reasoning_parser: ernie_x1
5-
tool_call_parser: ernie_x1
4+
reasoning_parser: ernie-x1
5+
tool_call_parser: ernie-x1
66
load_choices: "default_v1"
77
quantization: wint8
Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
#pragma once
15+
16+
#include "helper.h"
17+
#include "multiquery_decoder_attention_kernel.h"
18+
#include "utils.cuh"
19+
20+
template <typename T>
21+
void DecodeMLAAttentionKernel(
22+
const AppendAttnMetaData &meta_data,
23+
const paddle::Tensor &q, // [token_num, num_heads, head_dim]
24+
const paddle::Tensor &cache_k,
25+
const paddle::Tensor &cache_v,
26+
const paddle::optional<paddle::Tensor> &attn_mask,
27+
const paddle::optional<paddle::Tensor> &shift_bias,
28+
const paddle::optional<paddle::Tensor> &smooth_weight,
29+
const paddle::Tensor &seq_lens_q, // q_seq_len is 1
30+
const paddle::Tensor &seq_lens_kv,
31+
const paddle::Tensor &batch_id_per_token,
32+
const paddle::Tensor &cu_seqlens_q,
33+
const paddle::Tensor &block_table,
34+
int max_seq_len,
35+
int max_dec_len,
36+
float softmax_scale,
37+
float in_scale,
38+
bool causal,
39+
cudaStream_t &stream,
40+
paddle::Tensor *out) {
41+
const auto token_num = meta_data.token_nums;
42+
const auto block_size = meta_data.block_size;
43+
const auto bsz = meta_data.batch_size;
44+
const auto num_heads = meta_data.q_num_heads;
45+
const auto group_size = meta_data.q_num_heads / meta_data.kv_num_heads;
46+
const auto head_dim_qk = meta_data.head_dims;
47+
const auto head_dim_v = meta_data.head_dims_v;
48+
const float rope_scale = 0.0;
49+
const float rope_theta = 0.0;
50+
const uint32_t deal_each_time = get_cascade_attention_deal_each_time();
51+
const uint32_t num_stage = get_cascade_attention_num_stages();
52+
const uint32_t num_threads = get_cascade_attention_num_threads();
53+
54+
DISPATCH_CAUSAL(
55+
causal,
56+
CAUSAL,
57+
{DISPATCH_MLA_GROUP_SIZE(
58+
group_size,
59+
GROUP_SIZE,
60+
{DISPATCH_MLA_HEAD_DIM(
61+
head_dim_qk,
62+
HEAD_DIM_QK,
63+
{DISPATCH_MLA_HEAD_DIM(
64+
head_dim_v,
65+
HEAD_DIM_V,
66+
{DISPATCH_BLOCK_SIZE(
67+
block_size,
68+
BLOCK_SIZE,
69+
{DISPATCH_DEAL_EACH_TIME(deal_each_time, DEAL_EACH_TIME, {
70+
MultiQueryDecoderAttention<T,
71+
GROUP_SIZE,
72+
HEAD_DIM_QK,
73+
HEAD_DIM_V,
74+
BLOCK_SIZE,
75+
CAUSAL,
76+
2,
77+
16,
78+
DEAL_EACH_TIME>(
79+
meta_data,
80+
stream,
81+
q,
82+
cache_k,
83+
cache_v,
84+
attn_mask,
85+
shift_bias,
86+
smooth_weight,
87+
seq_lens_q,
88+
seq_lens_kv,
89+
batch_id_per_token,
90+
cu_seqlens_q,
91+
block_table,
92+
max_seq_len,
93+
max_dec_len,
94+
rope_scale,
95+
rope_theta,
96+
softmax_scale,
97+
in_scale,
98+
out);
99+
})})})})})});
100+
}
101+
102+
template void DecodeMLAAttentionKernel<paddle::bfloat16>(
103+
const AppendAttnMetaData &meta_data,
104+
const paddle::Tensor &q, // [token_num, num_heads, head_dim]
105+
const paddle::Tensor &cache_k,
106+
const paddle::Tensor &cache_v,
107+
const paddle::optional<paddle::Tensor> &attn_mask,
108+
const paddle::optional<paddle::Tensor> &shift_bias,
109+
const paddle::optional<paddle::Tensor> &smooth_weight,
110+
const paddle::Tensor &seq_lens_q, // q_seq_len is 1
111+
const paddle::Tensor &seq_lens_kv,
112+
const paddle::Tensor &batch_id_per_token,
113+
const paddle::Tensor &cu_seqlens_q,
114+
const paddle::Tensor &block_table,
115+
int max_seq_len,
116+
int max_dec_len,
117+
float softmax_scale,
118+
float in_scale,
119+
bool causal,
120+
cudaStream_t &stream,
121+
paddle::Tensor *out);
122+
123+
template void DecodeMLAAttentionKernel<paddle::float16>(
124+
const AppendAttnMetaData &meta_data,
125+
const paddle::Tensor &q, // [token_num, num_heads, head_dim]
126+
const paddle::Tensor &cache_k,
127+
const paddle::Tensor &cache_v,
128+
const paddle::optional<paddle::Tensor> &attn_mask,
129+
const paddle::optional<paddle::Tensor> &shift_bias,
130+
const paddle::optional<paddle::Tensor> &smooth_weight,
131+
const paddle::Tensor &seq_lens_q, // q_seq_len is 1
132+
const paddle::Tensor &seq_lens_kv,
133+
const paddle::Tensor &batch_id_per_token,
134+
const paddle::Tensor &cu_seqlens_q,
135+
const paddle::Tensor &block_table,
136+
int max_seq_len,
137+
int max_dec_len,
138+
float softmax_scale,
139+
float in_scale,
140+
bool causal,
141+
cudaStream_t &stream,
142+
paddle::Tensor *out);
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
#pragma once
15+
16+
#include "helper.h"
17+
#include "utils.cuh"
18+
19+
template <typename T>
20+
void DecodeMLAAttentionKernel(
21+
const AppendAttnMetaData &meta_data,
22+
const paddle::Tensor &q, // [token_num, num_heads, head_dim]
23+
const paddle::Tensor &cache_k,
24+
const paddle::Tensor &cache_v,
25+
const paddle::optional<paddle::Tensor> &attn_mask,
26+
const paddle::optional<paddle::Tensor> &shift_bias,
27+
const paddle::optional<paddle::Tensor> &smooth_weight,
28+
const paddle::Tensor &seq_lens_q, // q_seq_len is 1
29+
const paddle::Tensor &seq_lens_kv,
30+
const paddle::Tensor &batch_id_per_token,
31+
const paddle::Tensor &cu_seqlens_q,
32+
const paddle::Tensor &block_table,
33+
int max_seq_len,
34+
int max_dec_len,
35+
float softmax_scale,
36+
float in_scale,
37+
bool causal,
38+
cudaStream_t &stream,
39+
paddle::Tensor *out);

0 commit comments

Comments (0)