Skip to content

Commit 552df54

Browse files
committed
[CustomDevice] adapt c_embedding to phi namespace for custom devices
1 parent 600fc2f commit 552df54

File tree

4 files changed

+190
-1
lines changed

4 files changed

+190
-1
lines changed

paddle/phi/kernels/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,8 @@ set(cc_search_pattern
242242
"strings/cpu/*.cc"
243243
"fusion/*.cc"
244244
"stride/*.cc"
245-
"fusion/cpu/*.cc")
245+
"fusion/cpu/*.cc"
246+
"custom/*.cc")
246247

247248
if(WITH_MKLDNN)
248249
set(cc_search_pattern ${cc_search_pattern} "legacy/onednn/*.cc" "onednn/*.cc"
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#include "paddle/phi/kernels/c_embedding_grad_kernel.h"
16+
#include "glog/logging.h"
17+
#include "paddle/phi/api/backward/backward_api.h"
18+
#include "paddle/phi/api/include/api.h"
19+
#include "paddle/phi/backends/all_context.h"
20+
#include "paddle/phi/common/float16.h"
21+
#include "paddle/phi/core/kernel_registry.h"
22+
23+
namespace phi {
24+
25+
template <typename T, typename Context>
26+
void CEmbeddingGradKernel(const Context& dev_ctx,
27+
const DenseTensor& w,
28+
const DenseTensor& ids,
29+
const DenseTensor& out_grad,
30+
int64_t start_index,
31+
DenseTensor* w_grad) {
32+
#ifdef PADDLE_WITH_CUSTOM_DEVICE
33+
w_grad->Resize(w.dims());
34+
dev_ctx.template Alloc(w_grad, w.dtype());
35+
const auto& index_type = ids.dtype();
36+
if (index_type == phi::DataType::INT32 ||
37+
index_type == phi::DataType::INT64) {
38+
auto K = ids.numel();
39+
auto N = w.dims()[0];
40+
auto D = w.dims()[1];
41+
42+
auto x_tmp = std::make_shared<phi::DenseTensor>();
43+
x_tmp->ShareDataWith(ids).Resize({K});
44+
auto w_tmp = std::make_shared<phi::DenseTensor>();
45+
w_tmp->set_meta(w.meta());
46+
dev_ctx.Alloc(w_tmp.get(), w_tmp->dtype());
47+
auto out_grad_tmp = std::make_shared<phi::DenseTensor>();
48+
out_grad_tmp->ShareDataWith(out_grad).Resize({K, D});
49+
paddle::Tensor x_tensor(x_tmp), w_tensor(w_tmp),
50+
out_grad_tensor(out_grad_tmp);
51+
52+
auto start_index_tensor = paddle::experimental::full_like(
53+
x_tensor, start_index, x_tensor.dtype(), x_tensor.place());
54+
auto end_index_tensor = paddle::experimental::full_like(
55+
x_tensor, start_index + N, x_tensor.dtype(), x_tensor.place());
56+
auto ids_mask_tensor = paddle::experimental::logical_and(
57+
x_tensor.greater_equal(start_index_tensor),
58+
x_tensor.less_than(end_index_tensor));
59+
auto real_ids_tensor = (x_tensor - start_index_tensor)
60+
.multiply(paddle::experimental::cast(
61+
ids_mask_tensor, x_tensor.dtype()));
62+
auto out_grad_tensor_mul_mask =
63+
paddle::experimental::reshape(out_grad_tensor, {K, D})
64+
.multiply(paddle::experimental::reshape(
65+
paddle::experimental::cast(ids_mask_tensor, w.dtype()),
66+
{K, 1}));
67+
paddle::Tensor w_grad_tensor;
68+
paddle::experimental::embedding_grad(real_ids_tensor,
69+
w_tensor,
70+
out_grad_tensor_mul_mask,
71+
-1,
72+
false,
73+
&w_grad_tensor);
74+
w_grad->ShareDataWith(
75+
*reinterpret_cast<phi::DenseTensor*>(w_grad_tensor.impl().get()));
76+
77+
} else {
78+
PADDLE_THROW(phi::errors::Unavailable(
79+
"Custom Device c_embedding_grad ids only support int32 or int64."));
80+
}
81+
#else
82+
PADDLE_THROW(
83+
phi::errors::Unavailable("This kernel can only be functional when paddle "
84+
"is compiled with custom device."));
85+
#endif
86+
}
87+
} // namespace phi
88+
89+
// Register the backward kernel on the Custom backend for all layouts and the
// weight dtypes the forward kernel supports (float32, float16, bfloat16).
PD_REGISTER_KERNEL(c_embedding_grad,
                   Custom,
                   ALL_LAYOUT,
                   phi::CEmbeddingGradKernel,
                   float,
                   phi::dtype::float16,
                   phi::dtype::bfloat16) {}
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#include "paddle/phi/kernels/c_embedding_kernel.h"
16+
#include "glog/logging.h"
17+
#include "paddle/phi/api/backward/backward_api.h"
18+
#include "paddle/phi/api/include/api.h"
19+
#include "paddle/phi/backends/all_context.h"
20+
#include "paddle/phi/common/float16.h"
21+
#include "paddle/phi/core/kernel_registry.h"
22+
23+
namespace phi {
24+
25+
template <typename T, typename Context>
26+
void CEmbeddingKernel(const Context& dev_ctx,
27+
const DenseTensor& w,
28+
const DenseTensor& ids,
29+
int64_t start_index,
30+
int64_t vocab_size,
31+
DenseTensor* out) {
32+
#ifdef PADDLE_WITH_CUSTOM_DEVICE
33+
const auto& index_type = ids.dtype();
34+
if (index_type == phi::DataType::INT32 ||
35+
index_type == phi::DataType::INT64) {
36+
auto out_dims = out->dims();
37+
auto K = ids.numel();
38+
auto N = w.dims()[0];
39+
auto D = w.dims()[1];
40+
41+
auto x_tmp = std::make_shared<phi::DenseTensor>();
42+
x_tmp->ShareDataWith(ids).Resize({K});
43+
auto w_tmp = std::make_shared<phi::DenseTensor>();
44+
w_tmp->ShareDataWith(w).Resize({N, D});
45+
paddle::Tensor x_tensor(x_tmp), w_tensor(w_tmp);
46+
47+
auto start_index_tensor = paddle::experimental::full_like(
48+
x_tensor, start_index, x_tensor.dtype(), x_tensor.place());
49+
auto end_index_tensor = paddle::experimental::full_like(
50+
x_tensor, start_index + N, x_tensor.dtype(), x_tensor.place());
51+
auto ids_mask_tensor = paddle::experimental::logical_and(
52+
x_tensor.greater_equal(start_index_tensor),
53+
x_tensor.less_than(end_index_tensor));
54+
auto ids_tensor = (x_tensor - start_index_tensor)
55+
.multiply(paddle::experimental::cast(
56+
ids_mask_tensor, x_tensor.dtype()));
57+
auto out_tensor =
58+
paddle::experimental::reshape(
59+
paddle::experimental::cast(ids_mask_tensor, w_tensor.dtype()),
60+
{K, 1})
61+
.multiply(paddle::experimental::reshape(
62+
paddle::experimental::embedding(
63+
ids_tensor, w_tensor, -1, false),
64+
{K, D}));
65+
out->ShareDataWith(
66+
*reinterpret_cast<phi::DenseTensor*>(out_tensor.impl().get()))
67+
.Resize(out_dims);
68+
} else {
69+
PADDLE_THROW(phi::errors::Unavailable(
70+
"Custom Device c_embedding ids only support int32 or int64."));
71+
}
72+
#else
73+
PADDLE_THROW(
74+
phi::errors::Unavailable("This kernel can only be functional when paddle "
75+
"is compiled with custom device."));
76+
#endif
77+
}
78+
} // namespace phi
79+
80+
// Register the forward kernel on the Custom backend for all layouts and the
// supported weight dtypes (float32, float16, bfloat16); ids dtype is checked
// at run time inside the kernel.
PD_REGISTER_KERNEL(c_embedding,
                   Custom,
                   ALL_LAYOUT,
                   phi::CEmbeddingKernel,
                   float,
                   phi::dtype::float16,
                   phi::dtype::bfloat16) {}

test/legacy_test/c_embedding_op_base.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,12 +87,16 @@ def test_check_output(self):
8787
self.check_output_with_place(core.CUDAPlace(0))
8888
elif core.is_compiled_with_xpu():
8989
self.check_output_with_place(core.XPUPlace(0))
90+
elif core.is_compiled_with_custom_device():
91+
self.check_output_with_place(core.CustomPlace(0))
9092

9193
def test_check_grad(self):
    # Check d(Out)/d(W) numerically on whichever accelerator this build was
    # compiled for; CPU-only builds fall through and perform no check.
    if core.is_compiled_with_cuda():
        self.check_grad_with_place(core.CUDAPlace(0), ['W'], 'Out')
    elif core.is_compiled_with_xpu():
        self.check_grad_with_place(core.XPUPlace(0), ['W'], 'Out')
    elif core.is_compiled_with_custom_device():
        # NOTE(review): CustomPlace / is_compiled_with_custom_device are
        # usually parameterized by a device-type string — confirm these
        # zero-/one-argument forms exist in this branch of core.
        self.check_grad_with_place(core.CustomPlace(0), ['W'], 'Out')
96100

97101
def init_dtype(self):
98102
if core.is_compiled_with_cuda():
@@ -101,6 +105,9 @@ def init_dtype(self):
101105
elif core.is_compiled_with_xpu():
102106
self.dtype = "float32"
103107
self.ids_dtype = "int64"
108+
elif core.is_compiled_with_custom_device():
109+
self.dtype = "float32"
110+
self.ids_dtype = "int64"
104111

105112

106113
class TestCEmbeddingOpFP32(TestCEmbeddingOpBase):

0 commit comments

Comments
 (0)