
Commit 4615705

Changed the stream handling
1 parent b72a538 commit 4615705

File tree

4 files changed: +58 −15 lines

  core/runtime/execute_engine.cpp
  py/torch_tensorrt/dynamo/_compiler.py
  py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
  py/torch_tensorrt/dynamo/runtime/_stream_handler.py

core/runtime/execute_engine.cpp

Lines changed: 2 additions & 6 deletions
@@ -283,9 +283,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
   auto current_device_id = -1;
   if (inputs.size() > 0) {
     current_device_id = inputs[0].device().index(); // Done this way to avoid a call to cudart
-    if (current_device_id != compiled_engine->current_device_id) {
-      compiled_engine->stream = c10::cuda::getCurrentCUDAStream(current_device_id);
-    }
+    compiled_engine->stream = c10::cuda::getCurrentCUDAStream(current_device_id);
   }

   { // Engine Execution (execute on engine stream)

@@ -370,9 +368,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
   auto current_device_id = -1;
   if (inputs.size() > 0) {
     current_device_id = inputs[0].device().index(); // Done this way to avoid a call to cudart
-    if (current_device_id != compiled_engine->current_device_id) {
-      compiled_engine->stream = c10::cuda::getCurrentCUDAStream(current_device_id);
-    }
+    compiled_engine->stream = c10::cuda::getCurrentCUDAStream(current_device_id);
   }

   { // Engine Execution (execute on engine stream)
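Both hunks make the same change: the engine stream is refreshed from the active PyTorch CUDA stream on every call, instead of only when the input device id differs from the cached one. A minimal Python sketch of the case the old guard missed, where trt_mod and x are hypothetical stand-ins for a compiled module and its input:

    import torch

    s1 = torch.cuda.Stream()
    s2 = torch.cuda.Stream()

    with torch.cuda.stream(s1):
        out1 = trt_mod(x)  # engine picks up s1 for this device

    with torch.cuda.stream(s2):
        out2 = trt_mod(x)  # same device id: the old guard skipped the stream
                           # update, leaving the engine on stale s1; the new
                           # code re-reads the current stream and uses s2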

py/torch_tensorrt/dynamo/_compiler.py

Lines changed: 4 additions & 0 deletions
@@ -43,6 +43,7 @@
 from torch_tensorrt.dynamo.partitioning._resource_partitioner import (
     resource_partition,
 )
+from torch_tensorrt.dynamo.runtime._stream_handler import handle_cuda_stream
 from torch_tensorrt.dynamo.utils import (
     deallocate_module,
     get_cpu_memory_usage,

@@ -950,6 +951,7 @@ def preserve_module_specs(
         if attr.startswith("_frozen_param"):
             delattr(gm, attr)
     trt_module = None
+
     for name, _ in partitioned_module.named_children():
         submodule = getattr(partitioned_module, name)
         # filter on the GraphModule

@@ -1090,6 +1092,8 @@ def preserve_module_specs(
     settings.use_fast_partitioner = True

     dryrun_stats_display(dryrun_tracker, settings.dryrun)
+    if not settings.dryrun:
+        handle_cuda_stream(partitioned_module)

     return partitioned_module
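The hook runs only for real compilations (the dryrun path skips it) and rewrites the partitioned graph in place. A rough sketch of the intended transformation on a toy FX module; the toy module is an assumption, while the op names come from _stream_handler.py below:

    import torch
    import torch.fx

    class Toy(torch.nn.Module):
        def forward(self, x):
            return x + 1

    gm = torch.fx.symbolic_trace(Toy())
    # Before handle_cuda_stream(gm): placeholder(x) -> add -> output
    # After handle_cuda_stream(gm), per the pass below:
    #   tensorrt.enter_compute_stream() -> placeholder(x) -> add
    #   -> tensorrt.exit_compute_stream() -> output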

py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py

Lines changed: 6 additions & 9 deletions
@@ -172,7 +172,6 @@ def __init__(
         self._input_buffers: List[torch.Tensor] = []
         self._output_buffers: List[torch.Tensor] = []
         self.cudagraph: Optional[torch.cuda.CUDAGraph] = None
-        self._engine_stream: torch.cuda.Stream = torch.cuda.current_stream()
         self.output_tensors: Optional[List[torch.Tensor]] = None
         self.sync_stream = True

@@ -283,7 +282,6 @@ def setup_engine(self) -> None:
         ), f"TensorRT engine was not built to target current platform (target: {self.target_platform}, current: {Platform.current_platform()})"
         # Stream handling: if the caller stream is the pytorch default stream, create a new engine stream
         # otherwise, use the caller stream and disable stream synchronization
-        self._engine_stream = torch.cuda.current_stream()

         self.initialized = True
         runtime = trt.Runtime(TRT_LOGGER)

@@ -564,10 +562,10 @@ def run_standard_execution() -> torch.Tensor | Tuple[torch.Tensor, ...]:
             self.cudagraph.enable_debug_mode()

         with torch.cuda.graph(
-            self.cudagraph, stream=self._engine_stream
+            self.cudagraph, stream=torch.cuda.current_stream()
         ):
             self.context.execute_async_v3(
-                self._engine_stream.cuda_stream
+                torch.cuda.current_stream().cuda_stream
             )

         if self.profiling_enabled:

@@ -590,7 +588,7 @@ def run_standard_execution() -> torch.Tensor | Tuple[torch.Tensor, ...]:
         with warnings.catch_warnings():
             try:
                 self.context.execute_async_v3(
-                    self._engine_stream.cuda_stream
+                    torch.cuda.current_stream().cuda_stream
                 )
             except Warning as e:
                 breakpoint()

@@ -650,10 +648,9 @@ def run_output_allocator() -> torch.Tensor | Tuple[torch.Tensor, ...]:
                 else nullcontext()
             ):

-                with torch.cuda.stream(self._engine_stream):
-                    self.context.execute_async_v3(
-                        self._engine_stream.cuda_stream
-                    )  # The OutputAllocator is called by execute_async_v3()
+                self.context.execute_async_v3(
+                    torch.cuda.current_stream().cuda_stream
+                )  # The OutputAllocator is called by execute_async_v3()

             with (
                 torch.autograd.profiler.record_function(
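With the module-owned _engine_stream removed, execute_async_v3 always enqueues on whatever torch.cuda.current_stream() returns at call time, so the ambient stream (set by the caller, or by the inserted enter_compute_stream op) decides where the engine runs. A hedged sketch of the resulting caller-side contract, with trt_mod and x again hypothetical:

    import torch

    s = torch.cuda.Stream()
    with torch.cuda.stream(s):
        out = trt_mod(x)  # execute_async_v3 enqueues on s via current_stream()
    s.synchronize()       # the caller now owns synchronization of its stream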
py/torch_tensorrt/dynamo/runtime/_stream_handler.py

Lines changed: 46 additions & 0 deletions (new file)
@@ -0,0 +1,46 @@
+import torch
+import torch.fx
+
+
+def handle_cuda_stream(
+    partitioned_module: torch.fx.GraphModule,
+) -> torch.fx.GraphModule:
+    for node in partitioned_module.graph.nodes:
+        if node.op == "placeholder":
+            with partitioned_module.graph.inserting_before(node):
+                partitioned_module.graph.call_function(
+                    torch.ops.tensorrt.enter_compute_stream
+                )
+        elif node.op == "output":
+            with partitioned_module.graph.inserting_before(node):
+                partitioned_module.graph.call_function(
+                    torch.ops.tensorrt.exit_compute_stream
+                )
+
+    partitioned_module.graph.lint()
+    partitioned_module.recompile()
+    return partitioned_module
+
+
+@torch.library.custom_op("tensorrt::enter_compute_stream", mutates_args=())
+def enter_compute_stream() -> None:
+    stream = torch.cuda.Stream()
+    stream.wait_stream(torch.cuda.default_stream())
+    torch.cuda.set_stream(stream)
+
+
+@torch.library.custom_op("tensorrt::exit_compute_stream", mutates_args=())
+def exit_compute_stream() -> None:
+    stream = torch.cuda.current_stream()
+    torch.cuda.default_stream().wait_stream(stream)
+    torch.cuda.set_stream(torch.cuda.default_stream())
+
+
+@torch.library.register_fake("tensorrt::enter_compute_stream")
+def fake_enter_compute_stream() -> None:
+    pass
+
+
+@torch.library.register_fake("tensorrt::exit_compute_stream")
+def fake_exit_compute_stream() -> None:
+    pass
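The two ops are meant to be inserted as a pair by handle_cuda_stream, but the contract can be exercised by hand. A usage sketch, assuming a CUDA-enabled build; importing the module registers the ops as a side effect:

    import torch
    from torch_tensorrt.dynamo.runtime import _stream_handler  # registers the ops

    x = torch.randn(8, device="cuda")
    torch.ops.tensorrt.enter_compute_stream()  # switch to a fresh side stream
    y = x * 2                                  # kernels launch on the side stream
    torch.ops.tensorrt.exit_compute_stream()   # default stream waits, then is restored
    torch.cuda.synchronize()

Note that enter_compute_stream allocates a new torch.cuda.Stream on each call and torch.cuda.set_stream changes the thread-local current stream, so the exit op both orders the default stream after the side stream and restores the default stream as current.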
