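fix template: merge the duplicated float/__half specializations of ms_deform_attn_cuda_forward into one generic template with explicit instantiations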

cxiang26 · cxiang26 · commit 16d7c1e82bf7 · 2023-03-14T20:31:51.000+08:00
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn_kernel.cu b/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn_kernel.cu
@@ -21,9 +21,10 @@ void ms_deformable_im2col_cuda(cudaStream_t stream, scalar_t const* dataValue, i
         spatialSize, numHeads, channels, numLevels, numQuery, numPoint, dataCol);
 }
 
-template <>
-int32_t ms_deform_attn_cuda_forward<float>(const float* value, const int32_t* spatialShapes,
-    const int32_t* levelStartIndex, const float* samplingLoc, const float* attnWeight, float* output, int32_t batch,
+
+template <typename scalar_t>
+int32_t ms_deform_attn_cuda_forward(const scalar_t* value, const int32_t* spatialShapes,
+    const int32_t* levelStartIndex, const scalar_t* samplingLoc, const scalar_t* attnWeight, scalar_t* output, int32_t batch,
     int32_t mSpatialSize, int32_t mNumHeads, int32_t mChannels, int32_t mNumLevels, int32_t mNumQuery, int32_t mNumPoint,
     cudaStream_t stream)
 {
@@ -37,33 +38,20 @@ int32_t ms_deform_attn_cuda_forward<float>(const float* value, const int32_t* sp
     for (int32_t n = 0; n < batch / mIm2colStep; ++n)
     {
         auto columns = output +  n * mIm2colStep * perOutputSize;
-        ms_deformable_im2col_cuda<float>(stream, value + n * mIm2colStep * perValueSize, spatialShapes, levelStartIndex,
+        ms_deformable_im2col_cuda<scalar_t>(stream, value + n * mIm2colStep * perValueSize, spatialShapes, levelStartIndex,
             samplingLoc + n * mIm2colStep * perSampleLocSize, attnWeight + n * mIm2colStep * perAttnWeightSize, mIm2colStep,
             mSpatialSize, mNumHeads, mChannels, mNumLevels, mNumQuery, mNumPoint, columns);
     }
 
     return 0;
 }
 
-template <>
-int32_t ms_deform_attn_cuda_forward<__half>(const __half* value, const int32_t* spatialShapes,
-    const int32_t* levelStartIndex, const __half* samplingLoc, const __half* attnWeight, __half* output, int32_t batch,
+template int32_t ms_deform_attn_cuda_forward<float>(const float* value, const int32_t* spatialShapes,
+    const int32_t* levelStartIndex, const float* samplingLoc, const float* attnWeight, float* output, int32_t batch,
     int32_t mSpatialSize, int32_t mNumHeads, int32_t mChannels, int32_t mNumLevels, int32_t mNumQuery, int32_t mNumPoint,
-    cudaStream_t stream)
-{
-    auto perValueSize = mSpatialSize * mNumHeads * mChannels;
-    auto perSampleLocSize = mNumQuery * mNumHeads * mNumLevels * mNumPoint * 2;
-    auto perAttnWeightSize = mNumQuery * mNumHeads * mNumLevels * mNumPoint;
-    auto perOutputSize = mNumQuery * mNumHeads * mChannels;
+    cudaStream_t stream);
 
-    int32_t mIm2colStep = batch;
-    for (int32_t n = 0; n < batch / mIm2colStep; ++n)
-    {
-        auto columns = output + n * mIm2colStep * perOutputSize;
-        ms_deformable_im2col_cuda<__half>(stream, value + n * mIm2colStep * perValueSize, spatialShapes, levelStartIndex,
-            samplingLoc + n * mIm2colStep * perSampleLocSize, attnWeight + n * mIm2colStep * perAttnWeightSize, mIm2colStep,
-            mSpatialSize, mNumHeads, mChannels, mNumLevels, mNumQuery, mNumPoint, columns);
-    }
-
-    return 0;
-}
+template int32_t ms_deform_attn_cuda_forward<__half>(const __half* value, const int32_t* spatialShapes,
+    const int32_t* levelStartIndex, const __half* samplingLoc, const __half* attnWeight, __half* output, int32_t batch,
+    int32_t mSpatialSize, int32_t mNumHeads, int32_t mChannels, int32_t mNumLevels, int32_t mNumQuery, int32_t mNumPoint,
+    cudaStream_t stream);