
Commit 0b6b9ee

sync #1493 to support TorchAllocator as TensorRT Gpu Allocator and fix DCNv2 tensorrt plugin error (#1519)
1 parent 3f261e6 commit 0b6b9ee

File tree

6 files changed: +125 -9 lines changed

6 files changed

+125
-9
lines changed

csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv.cpp

Lines changed: 43 additions & 4 deletions
```diff
@@ -49,16 +49,55 @@ nvinfer1::IPluginV2DynamicExt *ModulatedDeformableConvPluginDynamic::clone() con
   return plugin;
 }
 
+static const nvinfer1::IDimensionExpr *get_hw(const nvinfer1::IDimensionExpr *input,
+                                              const nvinfer1::IDimensionExpr *weight,
+                                              const nvinfer1::IDimensionExpr *stride,
+                                              const nvinfer1::IDimensionExpr *pad,
+                                              const nvinfer1::IDimensionExpr *dilation,
+                                              nvinfer1::IExprBuilder &exprBuilder) {
+  using DimOp = nvinfer1::DimensionOperation;
+  auto expr_1 = exprBuilder.constant(1);
+
+  // effective kernel size: dilation * (weight - 1) + 1
+  auto kernel_0 = exprBuilder.operation(DimOp::kSUB, *weight, *expr_1);
+  auto kernel_1 = exprBuilder.operation(DimOp::kPROD, *dilation, *kernel_0);
+  auto kernel = exprBuilder.operation(DimOp::kSUM, *kernel_1, *expr_1);
+
+  // output size: (input + 2 * pad - kernel) // stride + 1
+  auto out_0 = exprBuilder.operation(DimOp::kSUM, *pad, *pad);
+  auto out_1 = exprBuilder.operation(DimOp::kSUM, *input, *out_0);
+  auto out_2 = exprBuilder.operation(DimOp::kSUB, *out_1, *kernel);
+  auto out_3 = exprBuilder.operation(DimOp::kFLOOR_DIV, *out_2, *stride);
+  auto out = exprBuilder.operation(DimOp::kSUM, *out_3, *expr_1);
+
+  return out;
+}
+
 nvinfer1::DimsExprs ModulatedDeformableConvPluginDynamic::getOutputDimensions(
     int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
     nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT {
+  using DimOp = nvinfer1::DimensionOperation;
+  auto weight_dim = inputs[3].d;
   nvinfer1::DimsExprs ret;
   ret.nbDims = 4;
   ret.d[0] = inputs[0].d[0];
   ret.d[1] = inputs[3].d[0];
 
-  ret.d[2] = inputs[1].d[2];
-  ret.d[3] = inputs[1].d[3];
+  auto input_h = inputs[0].d[2];
+  auto input_w = inputs[0].d[3];
+  auto weight_h = weight_dim[2];
+  auto weight_w = weight_dim[3];
+  auto dilation_w = exprBuilder.constant(mDilation.d[0]);
+  auto dilation_h = exprBuilder.constant(mDilation.d[1]);
+  auto pad_w = exprBuilder.constant(mPadding.d[0]);
+  auto pad_h = exprBuilder.constant(mPadding.d[1]);
+  auto stride_w = exprBuilder.constant(mStride.d[0]);
+  auto stride_h = exprBuilder.constant(mStride.d[1]);
+  auto expr_1 = exprBuilder.constant(1);
+  auto expr_2 = exprBuilder.constant(2);
+
+  ret.d[2] = get_hw(input_h, weight_h, stride_h, pad_h, dilation_h, exprBuilder);
+  ret.d[3] = get_hw(input_w, weight_w, stride_w, pad_w, dilation_w, exprBuilder);
 
   return ret;
 }
@@ -224,11 +263,11 @@ nvinfer1::IPluginV2 *ModulatedDeformableConvPluginDynamicCreator::createPlugin(
     }
     std::string field_name(fc->fields[i].name);
 
-    if (field_name.compare("deformable_group") == 0) {
+    if (field_name.compare("deform_groups") == 0) {
      deformableGroup = static_cast<const int *>(fc->fields[i].data)[0];
     }
 
-    if (field_name.compare("group") == 0) {
+    if (field_name.compare("groups") == 0) {
      group = static_cast<const int *>(fc->fields[i].data)[0];
     }
 
```
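The reworked `getOutputDimensions` computes the real output height and width instead of reusing the offset tensor's spatial dims, which was the DCNv2 plugin error for stride > 1. A plain-Python sketch of the arithmetic the `IDimensionExpr` graph in `get_hw` encodes (the helper name `conv_out_size` is ours, not part of the commit):

```python
def conv_out_size(input_size: int, kernel: int, stride: int, pad: int,
                  dilation: int) -> int:
    """One spatial axis of a (deformable) conv output, as built by get_hw.

    effective kernel: dilation * (kernel - 1) + 1
    output:           (input + 2 * pad - effective_kernel) // stride + 1
    """
    effective_kernel = dilation * (kernel - 1) + 1
    return (input_size + 2 * pad - effective_kernel) // stride + 1


# e.g. a 64x64 input, 3x3 kernel, stride 2, pad 1, dilation 1 -> 32x32
assert conv_out_size(64, kernel=3, stride=2, pad=1, dilation=1) == 32
```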

csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv_kernel.cu

Lines changed: 2 additions & 2 deletions
```diff
@@ -85,8 +85,8 @@ void ModulatedDeformConvForwardCUDAKernelLauncher(
   scalar_t* columns = (scalar_t*)workspace;
 
   const size_t input_step = channels * height * width;
-  const size_t offset_step = deformable_group * kernel_h * kernel_w * 2 * height * width;
-  const size_t mask_step = deformable_group * kernel_h * kernel_w * height * width;
+  const size_t offset_step = deformable_group * kernel_h * kernel_w * 2 * height_out * width_out;
+  const size_t mask_step = deformable_group * kernel_h * kernel_w * height_out * width_out;
   const size_t out_step = channels_out * height_out * width_out;
   const size_t out_group_step = out_step / group;
   const size_t col_g_step = channels * kernel_w * kernel_h / group * height_out * width_out;
```
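This step fix matters because DCNv2 predicts offsets and masks per *output* location: when stride > 1, their spatial dims are `height_out x width_out`, not the input's `height x width`, so the old per-sample strides walked the batch with the wrong pitch. A minimal shape sketch (tensor names and sizes are illustrative, not from the kernel):

```python
import torch

n, deform_group, kh, kw = 2, 1, 3, 3
h_out, w_out = 32, 32  # output dims, e.g. stride 2 on a 64x64 input

# DCNv2 layouts: one (dy, dx) pair and one mask value per kernel tap and
# output pixel, so per-sample batch strides must use h_out/w_out.
offset = torch.zeros(n, deform_group * 2 * kh * kw, h_out, w_out)
mask = torch.zeros(n, deform_group * kh * kw, h_out, w_out)

offset_step = deform_group * kh * kw * 2 * h_out * w_out
mask_step = deform_group * kh * kw * h_out * w_out
assert offset[0].numel() == offset_step and mask[0].numel() == mask_step
```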

mmdeploy/backend/tensorrt/__init__.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -33,7 +33,9 @@ def is_custom_ops_available():
 
 try:
     # import wrapper if pytorch is available
+    from .torch_allocator import TorchAllocator
     from .wrapper import TRTWrapper
     __all__ += ['TRTWrapper']
+    __all__ += ['TorchAllocator', 'TRTWrapper']
 except Exception:
     pass
```
mmdeploy/backend/tensorrt/torch_allocator.py (new file, inferred from the `from .torch_allocator import TorchAllocator` import above)

Lines changed: 63 additions & 0 deletions
```diff
@@ -0,0 +1,63 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import tensorrt as trt
+import torch
+
+from mmdeploy.utils import get_root_logger
+
+
+class TorchAllocator(trt.IGpuAllocator):
+    """PyTorch Cuda Allocator Wrapper."""
+
+    def __init__(self, device_id: int = 0) -> None:
+        super().__init__()
+
+        self.device_id = device_id
+        self.mems = set()
+
+    def __del__(self):
+        """destructor."""
+        mems = self.mems.copy()
+        (self.deallocate(mem) for mem in mems)
+
+    def allocate(self: trt.IGpuAllocator, size: int, alignment: int,
+                 flags: int) -> int:
+        """allocate gpu memory.
+
+        Args:
+            self (trt.IGpuAllocator): gpu allocator
+            size (int): memory size.
+            alignment (int): memory alignment.
+            flags (int): flags.
+
+        Returns:
+            int: memory address.
+        """
+        torch_stream = torch.cuda.current_stream(self.device_id)
+        logger = get_root_logger()
+        logger.debug(f'allocate {size} memory with TorchAllocator.')
+        assert alignment >= 0
+        if alignment > 0:
+            size = size | (alignment - 1) + 1
+        mem = torch.cuda.caching_allocator_alloc(
+            size, device=self.device_id, stream=torch_stream)
+        self.mems.add(mem)
+        return mem
+
+    def deallocate(self: trt.IGpuAllocator, memory: int) -> bool:
+        """deallocate memory.
+
+        Args:
+            self (trt.IGpuAllocator): gpu allocator
+            memory (int): memory address.
+
+        Returns:
+            bool: deallocate success.
+        """
+        logger = get_root_logger()
+        logger.debug(f'deallocate {memory} with TorchAllocator.')
+        if memory not in self.mems:
+            return False
+
+        torch.cuda.caching_allocator_delete(memory)
+        self.mems.discard(memory)
+        return True
```
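A minimal usage sketch for the new allocator: routing TensorRT's allocations through `torch.cuda`'s caching allocator lets TensorRT and PyTorch draw from one memory pool instead of competing for device memory. The engine path below is illustrative, not from the commit:

```python
from mmdeploy.backend.tensorrt import TorchAllocator
from mmdeploy.backend.tensorrt.utils import load

# Hand the runtime a TorchAllocator so engine deserialization allocates
# through PyTorch's caching allocator.
allocator = TorchAllocator(device_id=0)
engine = load('end2end.engine', allocator=allocator)  # illustrative path
```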

mmdeploy/backend/tensorrt/utils.py

Lines changed: 8 additions & 2 deletions
```diff
@@ -3,7 +3,7 @@
 import os
 import re
 import sys
-from typing import Dict, Optional, Sequence, Union
+from typing import Any, Dict, Optional, Sequence, Union
 
 import onnx
 import tensorrt as trt
@@ -24,17 +24,20 @@ def save(engine: trt.ICudaEngine, path: str) -> None:
         f.write(bytearray(engine.serialize()))
 
 
-def load(path: str) -> trt.ICudaEngine:
+def load(path: str, allocator: Optional[Any] = None) -> trt.ICudaEngine:
     """Deserialize TensorRT engine from disk.
 
     Args:
         path (str): The disk path to read the engine.
+        allocator (Any): gpu allocator
 
     Returns:
         tensorrt.ICudaEngine: The TensorRT engine loaded from disk.
     """
     load_tensorrt_plugin()
     with trt.Logger() as logger, trt.Runtime(logger) as runtime:
+        if allocator is not None:
+            runtime.gpu_allocator = allocator
         with open(path, mode='rb') as f:
             engine_bytes = f.read()
             trt.init_libnvinfer_plugins(logger, namespace='')
@@ -148,6 +151,9 @@ def from_onnx(onnx_model: Union[str, onnx.ModelProto],
     # create builder and network
     logger = trt.Logger(log_level)
     builder = trt.Builder(logger)
+
+    # TODO: use TorchAllocator as builder.gpu_allocator
+
     EXPLICIT_BATCH = 1 << (int)(
         trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
     network = builder.create_network(EXPLICIT_BATCH)
```
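The TODO above points at the builder-side counterpart of the runtime hookup. A sketch of what it could look like, assuming `trt.Builder.gpu_allocator` accepts an `IGpuAllocator` the same way `trt.Runtime.gpu_allocator` does (this code is not part of the commit):

```python
import tensorrt as trt

from mmdeploy.backend.tensorrt import TorchAllocator

# Hypothetical builder-side hookup mirroring runtime.gpu_allocator above,
# so engine building would also draw from PyTorch's caching allocator.
trt_logger = trt.Logger(trt.Logger.ERROR)
builder = trt.Builder(trt_logger)
builder.gpu_allocator = TorchAllocator(device_id=0)
```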

mmdeploy/backend/tensorrt/wrapper.py

Lines changed: 7 additions & 1 deletion
```diff
@@ -8,6 +8,7 @@
 from mmdeploy.utils.timer import TimeCounter
 from ..base import BACKEND_WRAPPER, BaseWrapper
 from .init_plugins import load_tensorrt_plugin
+from .torch_allocator import TorchAllocator
 from .utils import load
 
 
@@ -76,10 +77,12 @@ class TRTWrapper(BaseWrapper):
 
     def __init__(self,
                  engine: Union[str, trt.ICudaEngine],
-                 output_names: Optional[Sequence[str]] = None):
+                 output_names: Optional[Sequence[str]] = None,
+                 device_id: int = 0):
         super().__init__(output_names)
         load_tensorrt_plugin()
         self.engine = engine
+        self.allocator = TorchAllocator(device_id)
         if isinstance(self.engine, str):
             self.engine = load(engine)
 
@@ -90,6 +93,9 @@ def __init__(self,
         self._register_state_dict_hook(TRTWrapper.__on_state_dict)
         self.context = self.engine.create_execution_context()
 
+        if hasattr(self.context, 'temporary_allocator'):
+            self.context.temporary_allocator = self.allocator
+
         self.__load_io_names()
 
     def __load_io_names(self):
```
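With the new `device_id` argument, the wrapper builds its own `TorchAllocator` and, where the installed TensorRT version exposes `IExecutionContext.temporary_allocator` (hence the `hasattr` guard), routes the context's temporary buffers through it. A usage sketch; the engine path and I/O names are illustrative and depend on how the engine was exported:

```python
import torch

from mmdeploy.backend.tensorrt import TRTWrapper

# 'end2end.engine', 'input' and 'output' are illustrative placeholders.
model = TRTWrapper('end2end.engine', output_names=['output'], device_id=0)
outputs = model({'input': torch.randn(1, 3, 224, 224).cuda()})
```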
