// Copyright (c) OpenMMLab. All rights reserved.

- #include "cuda_runtime.h"
+ #include "cudnn.h"
#include "mmdeploy/codebase/mmaction/format_shape.h"
#include "mmdeploy/core/utils/device_utils.h"

using namespace std;

- namespace mmdeploy {
- namespace cuda {
+ namespace mmdeploy::mmaction::cuda {

- template <typename T>
- void Transpose(const T* src, const int* src_strides, T* dst, const int* dst_strides, int ndim,
-                int total, cudaStream_t stream);
+ #define CUDNN_CHECK(condition)                                                   \
+   do {                                                                           \
+     if (condition != CUDNN_STATUS_SUCCESS) {                                     \
+       MMDEPLOY_ERROR("cudnn error, msg = {}", cudnnGetErrorString(condition));   \
+     }                                                                            \
+   } while (0);

- class FormatShapeImpl : public ::mmdeploy::FormatShapeImpl {
+ class FormatShapeImpl : public FormatShapeOp {
 public:
-   explicit FormatShapeImpl(const Value& args) : ::mmdeploy::FormatShapeImpl(args) {}
+   explicit FormatShapeImpl(std::string input_format) : FormatShapeOp(std::move(input_format)) {
+     CUDNN_CHECK(cudnnCreate(&handle_));
+     CUDNN_CHECK(cudnnSetStream(handle_, GetNative<cudaStream_t>(stream())));
+     CUDNN_CHECK(cudnnCreateTensorDescriptor(&src_desc_));
+     CUDNN_CHECK(cudnnCreateTensorDescriptor(&dst_desc_));
+   }
+
+   ~FormatShapeImpl() override {
+     CUDNN_CHECK(cudnnDestroy(handle_));
+     CUDNN_CHECK(cudnnDestroyTensorDescriptor(src_desc_));
+     CUDNN_CHECK(cudnnDestroyTensorDescriptor(dst_desc_));
+   }

 protected:
-   Result<Tensor> Format(const std::vector<Tensor>& tensors, int clip_len, int num_clips) {
-     int N = tensors.size();
-     int H = tensors[0].shape(1);
-     int W = tensors[0].shape(2);
-     int C = tensors[0].shape(3);
+   Result<void> apply(const std::vector<Tensor>& inputs, Tensor& output, int clip_len,
+                      int num_clips) override {
+     auto N = static_cast<int64_t>(inputs.size());
+     auto H = inputs[0].shape(1);
+     auto W = inputs[0].shape(2);
+     auto C = inputs[0].shape(3);

    auto t0 = std::chrono::high_resolution_clock::now();
    TensorDesc desc = {device_, DataType::kFLOAT, {N, H, W, C}};
@@ -31,39 +45,39 @@ class FormatShapeImpl : public ::mmdeploy::FormatShapeImpl {
    int n_item = H * W * C;
    int copy_size = n_item * sizeof(float);
    for (int i = 0; i < N; i++) {
-       auto src_buffer = tensors[i].buffer();
+       auto src_buffer = inputs[i].buffer();
      auto dst_buffer = imgs.buffer();
-       OUTCOME_TRY(stream_.Copy(src_buffer, dst_buffer, copy_size, 0, offset));
+       OUTCOME_TRY(stream().Copy(src_buffer, dst_buffer, copy_size, 0, offset));
      offset += copy_size;
    }

-     Tensor dst;
-     if (arg_.input_format == "NCHW") {
-       OUTCOME_TRY(dst, FormatNCHW(imgs, clip_len, num_clips));
+     // Tensor dst;
+     if (input_format_ == "NCHW") {
+       OUTCOME_TRY(output, FormatNCHW(imgs, clip_len, num_clips));
    }
-     if (arg_.input_format == "NCTHW") {
-       OUTCOME_TRY(dst, FormatNCTHW(imgs, clip_len, num_clips));
+     if (input_format_ == "NCTHW") {
+       OUTCOME_TRY(output, FormatNCTHW(imgs, clip_len, num_clips));
    }
-     TensorShape expand_dim = dst.shape();
+     TensorShape expand_dim = output.shape();
    expand_dim.insert(expand_dim.begin(), 1);
-     dst.Reshape(expand_dim);
+     output.Reshape(expand_dim);

-     return dst;
+     return success();
  }

  Result<Tensor> FormatNCHW(Tensor& src, int clip_len, int num_clips) {
-     int N = src.shape(0);
-     int H = src.shape(1);
-     int W = src.shape(2);
-     int C = src.shape(3);
+     auto N = src.shape(0);
+     auto H = src.shape(1);
+     auto W = src.shape(2);
+     auto C = src.shape(3);
    return Transpose(src, {N, H, W, C}, {0, 3, 1, 2});
  };

  Result<Tensor> FormatNCTHW(Tensor& src, int clip_len, int num_clips) {
-     int N = src.shape(0);
-     int H = src.shape(1);
-     int W = src.shape(2);
-     int C = src.shape(3);
+     auto N = src.shape(0);
+     auto H = src.shape(1);
+     auto W = src.shape(2);
+     auto C = src.shape(3);
    int L = clip_len;
    if (N % L != 0) {
      return Status(eInvalidArgument);
@@ -74,7 +88,7 @@ class FormatShapeImpl : public ::mmdeploy::FormatShapeImpl {
    return Transpose(src, {M, L, H, W, C}, {0, 4, 1, 2, 3});
  };

-   Result<Tensor> Transpose(Tensor& src, const std::vector<int>& src_dims,
+   Result<Tensor> Transpose(Tensor& src, const TensorShape& src_dims,
                             const std::vector<int>& permutation) {
    Tensor dst(src.desc());
    TensorShape shape(src.shape().size());
@@ -83,7 +97,15 @@ class FormatShapeImpl : public ::mmdeploy::FormatShapeImpl {
    }
    dst.Reshape(shape);

-     int ndim = src_dims.size();
+     SetCudnnTensorDescriptor(src_dims, permutation);
+     CUDNN_CHECK(cudnnTransformTensor(handle_, &one_, src_desc_, src.data<float>(), &zero_,
+                                      dst_desc_, dst.data<float>()));
+
+     return dst;
+   }
+
+   void SetCudnnTensorDescriptor(const TensorShape& src_dims, const std::vector<int>& permutation) {
+     auto ndim = src_dims.size();
    std::vector<int> dst_dims(ndim);
    for (int i = 0; i < ndim; i++) {
      dst_dims[i] = src_dims[permutation[i]];
@@ -102,19 +124,21 @@ class FormatShapeImpl : public ::mmdeploy::FormatShapeImpl {
      src_strides[i] = buffer[permutation[i]];
    }

-     Buffer _src_strides(Device("cuda"), sizeof(int) * ndim);
-     Buffer _dst_strides(Device("cuda"), sizeof(int) * ndim);
-     OUTCOME_TRY(stream_.Copy(src_strides.data(), _src_strides));
-     OUTCOME_TRY(stream_.Copy(dst_strides.data(), _dst_strides));
-
-     ::mmdeploy::cuda::Transpose(src.data<float>(), GetNative<int*>(_src_strides), dst.data<float>(),
-                                 GetNative<int*>(_dst_strides), ndim, src.size(),
-                                 (cudaStream_t)stream_.GetNative());
-     return dst;
+     CUDNN_CHECK(cudnnSetTensorNdDescriptor(src_desc_, CUDNN_DATA_FLOAT, ndim, dst_dims.data(),
+                                            src_strides.data()));
+     CUDNN_CHECK(cudnnSetTensorNdDescriptor(dst_desc_, CUDNN_DATA_FLOAT, ndim, dst_dims.data(),
+                                            dst_strides.data()));
  }
+
+   constexpr static float one_{1.0};
+   constexpr static float zero_{0.0};
+   cudnnHandle_t handle_{};
+   cudnnTensorDescriptor_t src_desc_{};
+   cudnnTensorDescriptor_t dst_desc_{};
};

- MMDEPLOY_REGISTER_TRANSFORM_IMPL(::mmdeploy::FormatShapeImpl, (cuda, 0), FormatShapeImpl);
+ MMDEPLOY_REGISTER_FACTORY_FUNC(FormatShapeOp, (cuda, 0), [](std::string input_format) {
+   return std::make_unique<FormatShapeImpl>(std::move(input_format));
+ });

- }  // namespace cuda
- }  // namespace mmdeploy
+ }  // namespace mmdeploy::mmaction::cuda
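
Side note on the technique, not part of the commit: the rewritten Transpose relies on a stride trick that cuDNN supports directly. SetCudnnTensorDescriptor describes both tensors with the dimensions already permuted into destination order; the source descriptor keeps the source's original strides (permuted into that order) while the destination descriptor gets packed strides, and cudnnTransformTensor then performs the physical permutation. Below is a minimal standalone sketch of that trick for an NHWC-to-NCHW transpose, using only public CUDA/cuDNN APIs; the helper macro CHECK_CUDNN and the toy 1x2x2x3 shape are illustrative, not taken from the patch.

// Minimal sketch: NHWC (1x2x2x3) -> NCHW (1x3x2x2) via cudnnTransformTensor,
// using permuted strides on the source descriptor and packed strides on the destination.
#include <cuda_runtime.h>
#include <cudnn.h>

#include <cstdio>
#include <cstdlib>
#include <vector>

#define CHECK_CUDNN(expr)                                                 \
  do {                                                                    \
    cudnnStatus_t s = (expr);                                             \
    if (s != CUDNN_STATUS_SUCCESS) {                                      \
      std::fprintf(stderr, "cudnn error: %s\n", cudnnGetErrorString(s));  \
      std::exit(1);                                                       \
    }                                                                     \
  } while (0)

int main() {
  cudnnHandle_t handle;
  CHECK_CUDNN(cudnnCreate(&handle));

  // Source is NHWC with dims {N, H, W, C} = {1, 2, 2, 3} and packed strides {12, 6, 3, 1}.
  // Both descriptors use the *destination* dimension order NCHW = {1, 3, 2, 2}:
  //   - src_desc keeps the NHWC strides, permuted into NCHW order -> {12, 1, 6, 3}
  //   - dst_desc gets packed NCHW strides                         -> {12, 4, 2, 1}
  int dims[4] = {1, 3, 2, 2};
  int src_strides[4] = {12, 1, 6, 3};
  int dst_strides[4] = {12, 4, 2, 1};

  cudnnTensorDescriptor_t src_desc, dst_desc;
  CHECK_CUDNN(cudnnCreateTensorDescriptor(&src_desc));
  CHECK_CUDNN(cudnnCreateTensorDescriptor(&dst_desc));
  CHECK_CUDNN(cudnnSetTensorNdDescriptor(src_desc, CUDNN_DATA_FLOAT, 4, dims, src_strides));
  CHECK_CUDNN(cudnnSetTensorNdDescriptor(dst_desc, CUDNN_DATA_FLOAT, 4, dims, dst_strides));

  std::vector<float> h_src(12);
  for (int i = 0; i < 12; i++) h_src[i] = static_cast<float>(i);  // values 0..11 in NHWC order

  float *d_src = nullptr, *d_dst = nullptr;
  cudaMalloc(&d_src, 12 * sizeof(float));
  cudaMalloc(&d_dst, 12 * sizeof(float));
  cudaMemcpy(d_src, h_src.data(), 12 * sizeof(float), cudaMemcpyHostToDevice);

  // y = alpha * permute(x) + beta * y; alpha = 1, beta = 0 copies the permuted data.
  const float one = 1.0f, zero = 0.0f;
  CHECK_CUDNN(cudnnTransformTensor(handle, &one, src_desc, d_src, &zero, dst_desc, d_dst));

  std::vector<float> h_dst(12);
  cudaMemcpy(h_dst.data(), d_dst, 12 * sizeof(float), cudaMemcpyDeviceToHost);
  for (float v : h_dst) std::printf("%.0f ", v);  // expect: 0 3 6 9 1 4 7 10 2 5 8 11
  std::printf("\n");

  CHECK_CUDNN(cudnnDestroyTensorDescriptor(src_desc));
  CHECK_CUDNN(cudnnDestroyTensorDescriptor(dst_desc));
  CHECK_CUDNN(cudnnDestroy(handle));
  cudaFree(d_src);
  cudaFree(d_dst);
  return 0;
}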