diff --git a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu
index b8cfdbf3ce098b..acb3b83bc983f3 100644
--- a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/aligned_vector.h"
 #include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h"
diff --git a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.h b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.h
new file mode 100644
index 00000000000000..8b47f70265a35f
--- /dev/null
+++ b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.h
@@ -0,0 +1,52 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+namespace fusion {
+
+// Declaration of the fused masked multi-head attention (MMHA) kernel defined
+// in masked_multihead_attention_kernel.cu. T is the element type and Context
+// the device context, following the usual phi kernel signature convention.
+template <typename T, typename Context>
+void MMHAKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                const DenseTensor& cache_kv,
+                const paddle::optional<DenseTensor>& bias,
+                const paddle::optional<DenseTensor>& src_mask,
+                const paddle::optional<DenseTensor>& cum_offsets,
+                const paddle::optional<DenseTensor>& sequence_lengths,
+                const paddle::optional<DenseTensor>& rotary_tensor,
+                const paddle::optional<DenseTensor>& beam_cache_offset,
+                const paddle::optional<DenseTensor>& qkv_out_scale,
+                const paddle::optional<DenseTensor>& out_shift,
+                const paddle::optional<DenseTensor>& out_smooth,
+                int seq_len,
+                int rotary_emb_dims,
+                const bool use_neox_rotary_style,
+                const std::string& compute_dtype,
+                const float out_scale,
+                const int quant_round_type,
+                const float quant_max_bound,
+                const float quant_min_bound,
+                DenseTensor* out,
+                DenseTensor* cache_kv_out,
+                DenseTensor* beam_cache_offset_out);
+
+}  // namespace fusion
+}  // namespace phi