@@ -2832,6 +2832,132 @@ TEST(CApiTest, ConfigureCudaArenaAndDemonstrateMemoryArenaShrinkage) {
28322832#endif
28332833
28342834#ifdef USE_TENSORRT
2835+ TEST (CApiTest, TestExternalCUDAStreamWithIOBinding) {
2836+ const auto & api = Ort::GetApi ();
2837+ Ort::SessionOptions session_options;
2838+
2839+ OrtTensorRTProviderOptionsV2* trt_options;
2840+ ASSERT_TRUE (api.CreateTensorRTProviderOptions (&trt_options) == nullptr );
2841+ std::unique_ptr<OrtTensorRTProviderOptionsV2, decltype (api.ReleaseTensorRTProviderOptions )>
2842+ rel_trt_options (trt_options, api.ReleaseTensorRTProviderOptions );
2843+
2844+ // updating provider option with user provided compute stream
2845+ cudaStream_t compute_stream = nullptr ;
2846+ void * user_compute_stream = nullptr ;
2847+ cudaStreamCreate (&compute_stream);
2848+ ASSERT_TRUE (api.UpdateTensorRTProviderOptionsWithValue (rel_trt_options.get (), " user_compute_stream" , compute_stream) == nullptr );
2849+ ASSERT_TRUE (api.GetTensorRTProviderOptionsByName (rel_trt_options.get (), " user_compute_stream" , &user_compute_stream) == nullptr );
2850+ ASSERT_TRUE (user_compute_stream == (void *)compute_stream);
2851+
2852+ ASSERT_TRUE (api.SessionOptionsAppendExecutionProvider_TensorRT_V2 (
2853+ static_cast <OrtSessionOptions*>(session_options),
2854+ rel_trt_options.get ()) == nullptr );
2855+
2856+ Ort::Session session (*ort_env, MODEL_URI, session_options);
2857+ Ort::MemoryInfo info_cuda (" Cuda" , OrtAllocatorType::OrtArenaAllocator, 0 , OrtMemTypeDefault);
2858+
2859+ const std::array<int64_t , 2 > x_shape = {3 , 2 };
2860+ std::array<float , 3 * 2 > x_values = {1 .0f , 2 .0f , 3 .0f , 4 .0f , 5 .0f , 6 .0f };
2861+
2862+ /*
2863+ * Use cudaMallocHost() (pinned memory allocation) to create input/output tensors
2864+ */
2865+ float * input_data;
2866+ cudaMallocHost (&input_data, 3 * 2 * sizeof (float ));
2867+ ASSERT_NE (input_data, nullptr );
2868+ cudaMemcpy (input_data, x_values.data (), sizeof (float ) * x_values.size (), cudaMemcpyHostToDevice);
2869+
2870+ std::cout << " pinned memory allocation" << std::endl;
2871+ std::cout << " input tesnor:" << std::endl;
2872+ for (int i = 0 ; i < 6 ; i++) {
2873+ std::cout << input_data[i] << std::endl;
2874+ }
2875+
2876+ // Create an OrtValue tensor backed by data on CUDA memory
2877+ Ort::Value bound_x = Ort::Value::CreateTensor (info_cuda, reinterpret_cast <float *>(input_data), x_values.size (),
2878+ x_shape.data (), x_shape.size ());
2879+
2880+ const std::array<int64_t , 2 > expected_y_shape = {3 , 2 };
2881+ std::array<float , 3 * 2 > expected_y = {1 .0f , 4 .0f , 9 .0f , 16 .0f , 25 .0f , 36 .0f };
2882+
2883+ float * output_data;
2884+ cudaMallocHost (&output_data, 3 * 2 * sizeof (float ));
2885+ ASSERT_NE (output_data, nullptr );
2886+
2887+ // Create an OrtValue tensor backed by data on CUDA memory
2888+ Ort::Value bound_y = Ort::Value::CreateTensor (info_cuda, reinterpret_cast <float *>(output_data),
2889+ expected_y.size (), expected_y_shape.data (), expected_y_shape.size ());
2890+
2891+ // Create IoBinding for inputs and outputs.
2892+ Ort::IoBinding binding (session);
2893+ binding.BindInput (" X" , bound_x);
2894+ binding.BindOutput (" Y" , bound_y);
2895+
2896+ /*
2897+ * Use cudaMalloc() (pageable memory allocation first and then implicit pinned memory allocation) to create input/output tensors
2898+ */
2899+ float * input_data_2;
2900+ cudaMalloc (&input_data_2, 3 * 2 * sizeof (float ));
2901+ ASSERT_NE (input_data_2, nullptr );
2902+ cudaMemcpy (input_data_2, x_values.data (), sizeof (float ) * x_values.size (), cudaMemcpyHostToDevice);
2903+
2904+ // Create an OrtValue tensor backed by data on CUDA memory
2905+ Ort::Value bound_x_2 = Ort::Value::CreateTensor (info_cuda, reinterpret_cast <float *>(input_data_2), x_values.size (),
2906+ x_shape.data (), x_shape.size ());
2907+
2908+ float * output_data_2;
2909+ cudaMalloc (&output_data_2, 3 * 2 * sizeof (float ));
2910+ ASSERT_NE (output_data_2, nullptr );
2911+
2912+ // Create an OrtValue tensor backed by data on CUDA memory
2913+ Ort::Value bound_y_2 = Ort::Value::CreateTensor (info_cuda, reinterpret_cast <float *>(output_data_2),
2914+ expected_y.size (), expected_y_shape.data (), expected_y_shape.size ());
2915+
2916+ // Create IoBinding for inputs and outputs.
2917+ Ort::IoBinding binding_2 (session);
2918+ binding_2.BindInput (" X" , bound_x_2);
2919+ binding_2.BindOutput (" Y" , bound_y_2);
2920+
2921+ // Run with first iobindings
2922+ session.Run (Ort::RunOptions (), binding);
2923+
2924+ // Check the values against the bound raw memory (needs copying from device to host first)
2925+ std::array<float , 3 * 2 > y_values;
2926+ cudaMemcpy (y_values.data (), output_data, sizeof (float ) * y_values.size (), cudaMemcpyDeviceToHost);
2927+
2928+ std::cout << " pinned memory allocation" << std::endl;
2929+ std::cout << " output: " << std::endl;
2930+ for (auto y : y_values) {
2931+ std::cout << y << std::endl;
2932+ }
2933+ ASSERT_THAT (y_values, ::testing::ContainerEq (expected_y));
2934+
2935+ // Run with second iobindings
2936+ session.Run (Ort::RunOptions (), binding_2);
2937+
2938+ // Check the values against the bound raw memory (needs copying from device to host first)
2939+ cudaMemcpy (y_values.data (), output_data_2, sizeof (float ) * y_values.size (), cudaMemcpyDeviceToHost);
2940+
2941+ std::cout << " pageable memory allocation" << std::endl;
2942+ std::cout << " output: " << std::endl;
2943+ for (auto y : y_values) {
2944+ std::cout << y << std::endl;
2945+ }
2946+ ASSERT_THAT (y_values, ::testing::ContainerEq (expected_y));
2947+
2948+ // Clean up
2949+ binding.ClearBoundInputs ();
2950+ binding.ClearBoundOutputs ();
2951+ binding_2.ClearBoundInputs ();
2952+ binding_2.ClearBoundOutputs ();
2953+
2954+ cudaFreeHost (input_data);
2955+ cudaFreeHost (output_data);
2956+ cudaFree (input_data_2);
2957+ cudaFree (output_data_2);
2958+ cudaStreamDestroy (compute_stream);
2959+ }
2960+
28352961class CApiTensorRTTest : public testing ::Test, public ::testing::WithParamInterface<std::string> {};
28362962
28372963// This test uses CreateTensorRTProviderOptions/UpdateTensorRTProviderOptions APIs to configure and create a TensorRT Execution Provider
@@ -2849,15 +2975,6 @@ TEST_P(CApiTensorRTTest, TestConfigureTensorRTProviderOptions) {
28492975 ASSERT_TRUE (api.CreateTensorRTProviderOptions (&trt_options) == nullptr );
28502976 std::unique_ptr<OrtTensorRTProviderOptionsV2, decltype (api.ReleaseTensorRTProviderOptions )> rel_trt_options (trt_options, api.ReleaseTensorRTProviderOptions );
28512977
2852- // Only test updating provider option with user provided compute stream
2853- cudaStream_t compute_stream = nullptr ;
2854- void * user_compute_stream = nullptr ;
2855- cudaStreamCreateWithFlags (&compute_stream, cudaStreamNonBlocking);
2856- ASSERT_TRUE (api.UpdateTensorRTProviderOptionsWithValue (rel_trt_options.get (), " user_compute_stream" , compute_stream) == nullptr );
2857- ASSERT_TRUE (api.GetTensorRTProviderOptionsByName (rel_trt_options.get (), " user_compute_stream" , &user_compute_stream) == nullptr );
2858- ASSERT_TRUE (user_compute_stream == (void *)compute_stream);
2859- cudaStreamDestroy (compute_stream);
2860-
28612978 const char * engine_cache_path = " ./trt_engine_folder" ;
28622979
28632980 std::vector<const char *> keys{" device_id" , " has_user_compute_stream" , " trt_fp16_enable" , " trt_int8_enable" , " trt_engine_cache_enable" ,
0 commit comments