diff --git a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu
index b8cfdbf3ce098b..acb3b83bc983f3 100644
--- a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/aligned_vector.h"
 #include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h"
diff --git a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.h b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.h
new file mode 100644
index 00000000000000..8b47f70265a35f
--- /dev/null
+++ b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.h
@@ -0,0 +1,52 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+namespace fusion {
+
+// Declaration of the fused masked multi-head attention (MMHA) kernel defined
+// in masked_multihead_attention_kernel.cu. T is the element type and Context
+// the device context, following the usual phi kernel signature convention.
+template <typename T, typename Context>
+void MMHAKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                const DenseTensor& cache_kv,
+                const paddle::optional<DenseTensor>& bias,
+                const paddle::optional<DenseTensor>& src_mask,
+                const paddle::optional<DenseTensor>& cum_offsets,
+                const paddle::optional<DenseTensor>& sequence_lengths,
+                const paddle::optional<DenseTensor>& rotary_tensor,
+                const paddle::optional<DenseTensor>& beam_cache_offset,
+                const paddle::optional<DenseTensor>& qkv_out_scale,
+                const paddle::optional<DenseTensor>& out_shift,
+                const paddle::optional<DenseTensor>& out_smooth,
+                int seq_len,
+                int rotary_emb_dims,
+                const bool use_neox_rotary_style,
+                const std::string& compute_dtype,
+                const float out_scale,
+                const int quant_round_type,
+                const float quant_max_bound,
+                const float quant_min_bound,
+                DenseTensor* out,
+                DenseTensor* cache_kv_out,
+                DenseTensor* beam_cache_offset_out);
+
+}  // namespace fusion
+}  // namespace phi