Skip to content

Commit 3f8fff7

Browse files
authored
feat: Cudagraphs integration for Torch-TRT + Non-default Stream Utilization (#2881)
1 parent 29d2e43 commit 3f8fff7

File tree

16 files changed

+812
-207
lines changed

16 files changed

+812
-207
lines changed

WORKSPACE

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -101,8 +101,6 @@ http_archive(
101101
],
102102
)
103103

104-
105-
106104
####################################################################################
107105
# Locally installed dependencies (use in cases of custom dependencies or aarch64)
108106
####################################################################################

core/runtime/TRTEngine.cpp

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22

33
#include <cuda_runtime.h>
44
#include "NvInfer.h"
5+
#include "c10/cuda/CUDAStream.h"
56
#include "torch/csrc/jit/frontend/function_schema_parser.h"
67
#include "torch/cuda.h"
78

@@ -70,6 +71,15 @@ TRTEngine::TRTEngine(
7071
multi_gpu_device_check();
7172
set_rt_device(device_info);
7273

74+
// Set active stream to non-default stream
75+
auto current_stream = c10::cuda::getCurrentCUDAStream(device_info.id);
76+
if (current_stream == c10::cuda::getDefaultCUDAStream(device_info.id)) {
77+
active_stream = c10::cuda::getStreamFromPool(false, device_info.id);
78+
c10::cuda::setCurrentCUDAStream(active_stream);
79+
} else {
80+
active_stream = current_stream;
81+
}
82+
7383
rt = make_trt(nvinfer1::createInferRuntime(util::logging::get_logger()));
7484

7585
name = slugify(mod_name);
@@ -112,7 +122,9 @@ TRTEngine::TRTEngine(
112122

113123
num_io = std::make_pair(inputs, outputs);
114124
in_binding_names.resize(inputs);
125+
input_buffers.resize(inputs);
115126
out_binding_names.resize(outputs);
127+
output_buffers.resize(outputs);
116128
for (int64_t x = 0; x < cuda_engine->getNbIOTensors(); x++) {
117129
std::string bind_name = cuda_engine->getIOTensorName(x);
118130
if (cuda_engine->getTensorIOMode(bind_name.c_str()) == nvinfer1::TensorIOMode::kINPUT) {
@@ -124,6 +136,7 @@ TRTEngine::TRTEngine(
124136
} else {
125137
uint64_t inputs_size = _in_binding_names.size();
126138
in_binding_names.resize(inputs_size);
139+
input_buffers.resize(inputs_size);
127140
for (uint64_t pyt_idx = 0; pyt_idx < inputs_size; pyt_idx++) {
128141
auto binding_name = _in_binding_names[pyt_idx];
129142
// Check if the binding name provided is in the list of engine's bindings
@@ -153,6 +166,7 @@ TRTEngine::TRTEngine(
153166

154167
uint64_t outputs = _out_binding_names.size();
155168
out_binding_names.resize(outputs);
169+
output_buffers.resize(outputs);
156170
for (size_t pyt_idx = 0; pyt_idx < outputs; pyt_idx++) {
157171
auto binding_name = _out_binding_names[pyt_idx];
158172
// Check if the binding name provided is in the list of engine's bindings

core/runtime/TRTEngine.h

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,9 @@
77
#include <utility>
88

99
#include "ATen/core/function_schema.h"
10+
#include "ATen/cuda/CUDAGraph.h"
1011
#include "NvInfer.h"
12+
#include "c10/cuda/CUDAStream.h"
1113
#include "torch/custom_class.h"
1214

1315
#include "core/runtime/TRTEngineProfiler.h"
@@ -65,6 +67,14 @@ struct TRTEngine : torch::CustomClassHolder {
6567
void dump_engine_layer_info();
6668
friend std::ostream& operator<<(std::ostream& os, const TRTEngine& engine);
6769
static const char BINDING_DELIM = '%';
70+
71+
// CUDAGraph-Related Functionality
72+
at::cuda::CUDAGraph cudagraph = {};
73+
at::cuda::CUDAStream active_stream = c10::cuda::getDefaultCUDAStream();
74+
std::vector<at::Tensor> input_buffers = {};
75+
std::vector<at::Tensor> output_buffers = {};
76+
std::string shape_key;
77+
6878
// TODO: Implement a call method
6979
// c10::List<at::Tensor> Run(c10::List<at::Tensor> inputs);
7080

0 commit comments

Comments (0)