feat: Utilize non-default stream for runtimes

gs-olive · gs-olive · commit 62e7b82a3ac6 · 2024-06-28T00:03:56.000-07:00
- Add support for non-default streams
diff --git a/core/runtime/TRTEngine.cpp b/core/runtime/TRTEngine.cpp
@@ -2,6 +2,7 @@
 
 #include <cuda_runtime.h>
 #include "NvInfer.h"
+#include "c10/cuda/CUDAStream.h"
 #include "torch/csrc/jit/frontend/function_schema_parser.h"
 #include "torch/cuda.h"
 
@@ -66,6 +67,10 @@ TRTEngine::TRTEngine(
   multi_gpu_device_check();
   set_rt_device(device_info);
 
+  // Set active stream to high-priority, non-default stream
+  active_stream = c10::cuda::getStreamFromPool(true, device_info.id);
+  c10::cuda::setCurrentCUDAStream(active_stream);
+
   rt = make_trt(nvinfer1::createInferRuntime(util::logging::get_logger()));
 
   name = slugify(mod_name);
diff --git a/core/runtime/TRTEngine.h b/core/runtime/TRTEngine.h
@@ -9,6 +9,7 @@
 #include "ATen/core/function_schema.h"
 #include "ATen/cuda/CUDAGraph.h"
 #include "NvInfer.h"
+#include "c10/cuda/CUDAStream.h"
 #include "torch/custom_class.h"
 
 #include "core/runtime/TRTEngineProfiler.h"
@@ -65,6 +66,7 @@ struct TRTEngine : torch::CustomClassHolder {
 
   // CUDAGraph-Related Functionality
   at::cuda::CUDAGraph cudagraph = {};
+  at::cuda::CUDAStream active_stream = c10::cuda::getDefaultCUDAStream();
   std::vector<at::Tensor> input_buffers = {};
   std::vector<at::Tensor> output_buffers = {};
   std::string shape_key;
diff --git a/core/runtime/execute_engine.cpp b/core/runtime/execute_engine.cpp
@@ -1,4 +1,3 @@
-#include "c10/cuda/CUDAGuard.h"
 #include "c10/cuda/CUDAStream.h"
 
 #include "torch/csrc/jit/runtime/custom_operator.h"
@@ -64,9 +63,20 @@ bool _cudagraphs_validate_shapes(std::vector<at::Tensor> inputs, c10::intrusive_
   // invalidate the existing cudagraphs object
 
   // Populate the shape key for the inputs
+  // x: (3, 4), y: (4, 5) --> Key: (3,4)(4,5)
   std::stringstream new_shape_key_ss;
   for (auto input : inputs) {
-    new_shape_key_ss << input.sizes();
+    new_shape_key_ss << "(";
+    auto sizes = input.sizes();
+    auto rank = input.sizes().size();
+    for (auto i = 0; i < rank; i++) {
+      new_shape_key_ss << sizes[i];
+      // For all but the final dimension in the shape key, add comma separator
+      if (i < rank - 1) {
+        new_shape_key_ss << ",";
+      }
+    }
+    new_shape_key_ss << ")";
   }
 
   auto new_shape_key = new_shape_key_ss.str();
@@ -128,6 +138,10 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
             select_rt_device(compiled_engine->device_info, curr_device, compiled_engine->hardware_compatible);
         set_rt_device(device);
 
+        // Update active stream based on new device
+        compiled_engine->active_stream = c10::cuda::getStreamFromPool(true, device.id);
+        c10::cuda::setCurrentCUDAStream(compiled_engine->active_stream);
+
         // Target device is new device
         target_device += std::to_string(device.id);
 
@@ -157,6 +171,8 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
       }
     }
 
+    // Buffer for shape tensor input values; declared here so the storage (and its addresses) persists throughout the runtime scope
+    std::list<std::vector<int32_t>> inputShapeTensorValues;
     {
       std::unique_ptr<torch::autograd::profiler::RecordProfile> input_profiler_guard;
       if (compiled_engine->profile_execution) {
@@ -252,23 +268,18 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
 
   if (!CUDAGRAPHS_MODE) {
     // If not in cudagraphs mode, proceed with enqueueV3 as normal
-    c10::cuda::CUDAStream stream = c10::cuda::getCurrentCUDAStream(inputs[0].device().index());
-    compiled_engine->exec_ctx->enqueueV3(stream);
+    compiled_engine->exec_ctx->enqueueV3(compiled_engine->active_stream);
   } else if (need_cudagraphs_record) {
     // If cudagraphs needs to record a graph, capture the enqueueV3 call in a graph
 
     // Cudagraphs cannot record on the default stream, so use an alternate
     c10::cuda::CUDAStream stream = c10::cuda::getStreamFromPool(true, inputs[0].device().index());
-    c10::cuda::CUDAStreamGuard guard(stream);
-    compiled_engine->exec_ctx->enqueueV3(stream);
+    compiled_engine->exec_ctx->enqueueV3(compiled_engine->active_stream);
 
     compiled_engine->cudagraph.capture_begin();
-    compiled_engine->exec_ctx->enqueueV3(stream);
+    compiled_engine->exec_ctx->enqueueV3(compiled_engine->active_stream);
     compiled_engine->cudagraph.capture_end();
 
-    // Reset the stream to its original setting
-    guard.reset_stream(guard.original_stream());
-
   } else {
     // If the cudagraph has already been recorded, copy the input buffers and replay it
     for (auto i = 0; i < inputs.size(); i++) {
diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
@@ -46,8 +46,9 @@ def __init__(
         self.input_buffers: List[torch.Tensor] = []
         self.output_buffers: List[torch.Tensor] = []
         self.cudagraph: Optional[torch.cuda.CUDAGraph] = None
-        # {shape: cudagraph}
-        # limitation on CG
+        self.active_stream: Optional[torch.cuda.Stream] = None
+
+        # TODO: Make the below a Dictionary {shape: cudagraph}
         self.shape_key: Optional[str] = None
 
         # See https://github.com/pytorch/pytorch/blob/acfe237a71af609e837a34bb38048aa8acb8eb4d/torch/cuda/graphs.py#L92-L98
@@ -97,6 +98,10 @@ def _initialize(self) -> None:
             self.cudagraph = torch.cuda.CUDAGraph()
             self.graph_capturer = torch.cuda.graphs.graph(self.cudagraph)
 
+        # Set the active stream using the current device, with a high priority flag
+        self.active_stream = torch.cuda.Stream(torch.cuda.current_device(), priority=-1)
+        torch.cuda.set_stream(self.active_stream)
+
     def _check_initialized(self) -> None:
         if not self.initialized:
             raise RuntimeError("PythonTorchTensorRTModule is not initialized.")
@@ -185,9 +190,15 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, .
                             self.target_device_id,
                             self.target_device_properties,
                         )
+
+                        # Update current device
                         device = torch.device(device_id)
                         torch.cuda.set_device(device_id)
 
+                        # Update current stream
+                        self.active_stream = torch.cuda.Stream(device, priority=-1)
+                        torch.cuda.set_stream(self.active_stream)
+
                         contiguous_inputs = [
                             tensor.to(device) for tensor in contiguous_inputs
                         ]
@@ -306,21 +317,19 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, .
             ):
 
                 if not cudagraphs_enabled:
-                    self.context.execute_async_v3(
-                        torch.cuda.current_stream().cuda_stream
-                    )
+                    self.context.execute_async_v3(self.active_stream)
 
                 elif need_cudagraphs_record:
                     self.input_buffers = list(contiguous_inputs)
                     self.output_buffers = list(outputs)
 
-                    current_stream = self.graph_capturer.capture_stream
+                    graph_capturer_stream = self.graph_capturer.capture_stream
 
-                    self.context.execute_async_v3(current_stream.cuda_stream)
-                    current_stream.synchronize()
+                    self.context.execute_async_v3(graph_capturer_stream.cuda_stream)
+                    graph_capturer_stream.synchronize()
 
                     with self.graph_capturer:
-                        self.context.execute_async_v3(current_stream.cuda_stream)
+                        self.context.execute_async_v3(graph_capturer_stream.cuda_stream)
 
                 else:
                     for idx, input_tensor in enumerate(inputs):
@@ -377,8 +386,8 @@ def cudagraphs_validate_shapes(self, inputs: Sequence[torch.Tensor]) -> bool:
         """
         # Representation of input shapes to a given model
         # Shapes are concatenated as so:
-        # x: (3, 4), y: (4, 5) --> Key: (3, 4)(4, 5)
-        new_shape_key = "".join(str(tuple(t.shape)) for t in inputs)
+        # x: (3, 4), y: (4, 5) --> Key: (3,4)(4,5)
+        new_shape_key = "".join(str(tuple(t.shape)).replace(" ", "") for t in inputs)
 
         # If the new shape key differs from the existing one,
         # invalidate the old shape key and remove the CUDAGraph
diff --git a/py/torch_tensorrt/runtime/cudagraphs.py b/py/torch_tensorrt/runtime/cudagraphs.py
@@ -4,7 +4,9 @@
 
 import torch
 
-if find_spec("torch_tensorrt._C") is not None:
+import torch_tensorrt
+
+if torch_tensorrt.ENABLED_FEATURES.torch_tensorrt_runtime:
     _PY_RT_CUDAGRAPHS = torch.ops.tensorrt.get_cudagraphs_mode()
 else:
     _PY_RT_CUDAGRAPHS = False
diff --git a/tests/py/dynamo/runtime/test_cudagraphs.py b/tests/py/dynamo/runtime/test_cudagraphs.py
@@ -8,10 +8,6 @@
 from ..testing_utilities import DECIMALS_OF_AGREEMENT
 
 
-@unittest.skipIf(
-    not torch_tensorrt.ENABLED_FEATURES.torch_tensorrt_runtime,
-    "Torch-TensorRT runtime is not available",
-)
 class TestCudagraphs(TestCase):
     def test_cudagraphs_on(self):
         torch_tensorrt.runtime.set_cudagraphs_mode(True)
@@ -66,6 +62,10 @@ def forward(self, x):
         )
         torch._dynamo.reset()
 
+    @unittest.skipIf(
+        not torch_tensorrt.ENABLED_FEATURES.torch_tensorrt_runtime,
+        "Torch-TensorRT runtime is not available",
+    )
     def test_cudagraphs_enabled_inference_cpp(self):
         class SampleModel(torch.nn.Module):
             def forward(self, x):