Skip to content

Commit 4d1f48e

Browse files
authored
Merge branch 'develop' into ut
2 parents 91d198c + 3cbca75 commit 4d1f48e

File tree

94 files changed

+7609
-2788
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

94 files changed

+7609
-2788
lines changed

benchmarks/backend_request_func.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ class RequestFuncInput:
5151
ignore_eos: bool = False
5252
language: Optional[str] = None
5353
debug: bool = False
54+
response_format: Optional[dict] = None
5455

5556

5657
@dataclass
@@ -93,8 +94,11 @@ async def async_request_eb_openai_chat_completions(
9394
"stream_options": {
9495
"include_usage": True,
9596
"continuous_usage_stats": True,
96-
},
97+
}
9798
}
99+
if request_func_input.response_format:
100+
payload["response_format"] = request_func_input.response_format
101+
98102
# 超参由yaml传入
99103
payload.update(request_func_input.hyper_parameters)
100104

benchmarks/benchmark_dataset.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,8 @@ class SampleRequest:
4545
json_data: Optional[dict]
4646
prompt_len: int
4747
expected_output_len: int
48-
48+
response_format: Optional[dict] = None
49+
4950

5051
class BenchmarkDataset(ABC):
5152
"""BenchmarkDataset"""
@@ -297,6 +298,7 @@ def sample(
297298
json_data = entry
298299
prompt = entry["messages"][-1].get("content", "")
299300
history_QA = entry.get("messages", [])
301+
response_format = entry.get("response_format")
300302
new_output_len = int(entry.get("max_tokens", 12288))
301303

302304
if enable_multimodal_chat:
@@ -309,6 +311,7 @@ def sample(
309311
prompt_len=0,
310312
history_QA=history_QA,
311313
expected_output_len=new_output_len,
314+
response_format=response_format
312315
)
313316
)
314317
cnt += 1

benchmarks/benchmark_serving.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -336,6 +336,7 @@ async def benchmark(
336336
input_requests[0].no,
337337
)
338338
test_history_QA = input_requests[0].history_QA
339+
response_format = input_requests[0].response_format
339340

340341
test_input = RequestFuncInput(
341342
model=model_id,
@@ -351,6 +352,7 @@ async def benchmark(
351352
ignore_eos=ignore_eos,
352353
debug=debug,
353354
extra_body=extra_body,
355+
response_format=response_format
354356
)
355357

356358
print("test_input:", test_input)
@@ -382,6 +384,7 @@ async def benchmark(
382384
logprobs=logprobs,
383385
ignore_eos=ignore_eos,
384386
extra_body=extra_body,
387+
response_format=response_format
385388
)
386389
profile_output = await request_func(request_func_input=profile_input)
387390
if profile_output.success:
@@ -420,6 +423,7 @@ async def limited_request_func(request_func_input, pbar):
420423
request.no,
421424
)
422425
history_QA = request.history_QA
426+
response_format = request.response_format
423427

424428
req_model_id, req_model_name = model_id, model_name
425429
if lora_modules:
@@ -440,6 +444,7 @@ async def limited_request_func(request_func_input, pbar):
440444
debug=debug,
441445
ignore_eos=ignore_eos,
442446
extra_body=extra_body,
447+
response_format=response_format
443448
)
444449
tasks.append(asyncio.create_task(limited_request_func(request_func_input=request_func_input, pbar=pbar)))
445450
outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
@@ -455,6 +460,7 @@ async def limited_request_func(request_func_input, pbar):
455460
api_url=base_url + "/stop_profile",
456461
output_len=test_output_len,
457462
logprobs=logprobs,
463+
response_format=response_format
458464
)
459465
profile_output = await request_func(request_func_input=profile_input)
460466
if profile_output.success:
@@ -982,7 +988,7 @@ def main(args: argparse.Namespace):
982988
if args.result_dir:
983989
file_name = os.path.join(args.result_dir, file_name)
984990
with open(file_name, "w", encoding="utf-8") as outfile:
985-
json.dump(result_json, outfile)
991+
json.dump(result_json, outfile, ensure_ascii=False)
986992
save_to_pytorch_benchmark_format(args, result_json, file_name)
987993

988994

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
max_model_len: 32768
2+
max_num_seqs: 128
3+
gpu_memory_utilization: 0.85
4+
tensor_parallel_size: 1
5+
limit_mm_per_prompt: '{"image": 100, "video": 100}'
6+
enable_mm: True
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
top_p: 0.8
2+
temperature: 0.7
3+
metadata:
4+
min_tokens: 1
5+
max_tokens: 32768
6+
repetition_penalty: 1.05
7+
frequency_penalty: 0
8+
presence_penalty: 0

benchmarks/yaml/x1-64k-w4a8c8-tp4.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
reasoning-parser: ernie_x1
2-
tool_call_parser: ernie_x1
1+
reasoning-parser: ernie-x1
2+
tool_call_parser: ernie-x1
33
tensor_parallel_size: 4
44
max_model_len: 65536
55
max_num_seqs: 128
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
tensor_parallel_size: 1
22
max_model_len: 131072
33
max_num_seqs: 32
4-
reasoning_parser: ernie_x1
5-
tool_call_parser: ernie_x1
4+
reasoning_parser: ernie-x1
5+
tool_call_parser: ernie-x1
66
load_choices: "default_v1"
77
quantization: wint8
Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
#pragma once
15+
16+
#include "helper.h"
17+
#include "multiquery_decoder_attention_kernel.h"
18+
#include "utils.cuh"
19+
20+
template <typename T>
21+
void DecodeMLAAttentionKernel(
22+
const AppendAttnMetaData &meta_data,
23+
const paddle::Tensor &q, // [token_num, num_heads, head_dim]
24+
const paddle::Tensor &cache_k,
25+
const paddle::Tensor &cache_v,
26+
const paddle::optional<paddle::Tensor> &attn_mask,
27+
const paddle::optional<paddle::Tensor> &shift_bias,
28+
const paddle::optional<paddle::Tensor> &smooth_weight,
29+
const paddle::Tensor &seq_lens_q, // q_seq_len is 1
30+
const paddle::Tensor &seq_lens_kv,
31+
const paddle::Tensor &batch_id_per_token,
32+
const paddle::Tensor &cu_seqlens_q,
33+
const paddle::Tensor &block_table,
34+
int max_seq_len,
35+
int max_dec_len,
36+
float softmax_scale,
37+
float in_scale,
38+
bool causal,
39+
cudaStream_t &stream,
40+
paddle::Tensor *out) {
41+
const auto token_num = meta_data.token_nums;
42+
const auto block_size = meta_data.block_size;
43+
const auto bsz = meta_data.batch_size;
44+
const auto num_heads = meta_data.q_num_heads;
45+
const auto group_size = meta_data.q_num_heads / meta_data.kv_num_heads;
46+
const auto head_dim_qk = meta_data.head_dims;
47+
const auto head_dim_v = meta_data.head_dims_v;
48+
const float rope_scale = 0.0;
49+
const float rope_theta = 0.0;
50+
const uint32_t deal_each_time = get_cascade_attention_deal_each_time();
51+
const uint32_t num_stage = get_cascade_attention_num_stages();
52+
const uint32_t num_threads = get_cascade_attention_num_threads();
53+
54+
DISPATCH_CAUSAL(
55+
causal,
56+
CAUSAL,
57+
{DISPATCH_MLA_GROUP_SIZE(
58+
group_size,
59+
GROUP_SIZE,
60+
{DISPATCH_MLA_HEAD_DIM(
61+
head_dim_qk,
62+
HEAD_DIM_QK,
63+
{DISPATCH_MLA_HEAD_DIM(
64+
head_dim_v,
65+
HEAD_DIM_V,
66+
{DISPATCH_BLOCK_SIZE(
67+
block_size,
68+
BLOCK_SIZE,
69+
{DISPATCH_DEAL_EACH_TIME(deal_each_time, DEAL_EACH_TIME, {
70+
MultiQueryDecoderAttention<T,
71+
GROUP_SIZE,
72+
HEAD_DIM_QK,
73+
HEAD_DIM_V,
74+
BLOCK_SIZE,
75+
CAUSAL,
76+
2,
77+
16,
78+
DEAL_EACH_TIME>(
79+
meta_data,
80+
stream,
81+
q,
82+
cache_k,
83+
cache_v,
84+
attn_mask,
85+
shift_bias,
86+
smooth_weight,
87+
seq_lens_q,
88+
seq_lens_kv,
89+
batch_id_per_token,
90+
cu_seqlens_q,
91+
block_table,
92+
max_seq_len,
93+
max_dec_len,
94+
rope_scale,
95+
rope_theta,
96+
softmax_scale,
97+
in_scale,
98+
out);
99+
})})})})})});
100+
}
101+
102+
template void DecodeMLAAttentionKernel<paddle::bfloat16>(
103+
const AppendAttnMetaData &meta_data,
104+
const paddle::Tensor &q, // [token_num, num_heads, head_dim]
105+
const paddle::Tensor &cache_k,
106+
const paddle::Tensor &cache_v,
107+
const paddle::optional<paddle::Tensor> &attn_mask,
108+
const paddle::optional<paddle::Tensor> &shift_bias,
109+
const paddle::optional<paddle::Tensor> &smooth_weight,
110+
const paddle::Tensor &seq_lens_q, // q_seq_len is 1
111+
const paddle::Tensor &seq_lens_kv,
112+
const paddle::Tensor &batch_id_per_token,
113+
const paddle::Tensor &cu_seqlens_q,
114+
const paddle::Tensor &block_table,
115+
int max_seq_len,
116+
int max_dec_len,
117+
float softmax_scale,
118+
float in_scale,
119+
bool causal,
120+
cudaStream_t &stream,
121+
paddle::Tensor *out);
122+
123+
template void DecodeMLAAttentionKernel<paddle::float16>(
124+
const AppendAttnMetaData &meta_data,
125+
const paddle::Tensor &q, // [token_num, num_heads, head_dim]
126+
const paddle::Tensor &cache_k,
127+
const paddle::Tensor &cache_v,
128+
const paddle::optional<paddle::Tensor> &attn_mask,
129+
const paddle::optional<paddle::Tensor> &shift_bias,
130+
const paddle::optional<paddle::Tensor> &smooth_weight,
131+
const paddle::Tensor &seq_lens_q, // q_seq_len is 1
132+
const paddle::Tensor &seq_lens_kv,
133+
const paddle::Tensor &batch_id_per_token,
134+
const paddle::Tensor &cu_seqlens_q,
135+
const paddle::Tensor &block_table,
136+
int max_seq_len,
137+
int max_dec_len,
138+
float softmax_scale,
139+
float in_scale,
140+
bool causal,
141+
cudaStream_t &stream,
142+
paddle::Tensor *out);
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
#pragma once
15+
16+
#include "helper.h"
17+
#include "utils.cuh"
18+
19+
template <typename T>
20+
void DecodeMLAAttentionKernel(
21+
const AppendAttnMetaData &meta_data,
22+
const paddle::Tensor &q, // [token_num, num_heads, head_dim]
23+
const paddle::Tensor &cache_k,
24+
const paddle::Tensor &cache_v,
25+
const paddle::optional<paddle::Tensor> &attn_mask,
26+
const paddle::optional<paddle::Tensor> &shift_bias,
27+
const paddle::optional<paddle::Tensor> &smooth_weight,
28+
const paddle::Tensor &seq_lens_q, // q_seq_len is 1
29+
const paddle::Tensor &seq_lens_kv,
30+
const paddle::Tensor &batch_id_per_token,
31+
const paddle::Tensor &cu_seqlens_q,
32+
const paddle::Tensor &block_table,
33+
int max_seq_len,
34+
int max_dec_len,
35+
float softmax_scale,
36+
float in_scale,
37+
bool causal,
38+
cudaStream_t &stream,
39+
paddle::Tensor *out);

0 commit comments

Comments (0)