@@ -111,7 +111,6 @@ struct cudaFunctionTable {
111
111
typedef struct cudaFunctionTable cudaFunctionTable_t ;
112
112
static cudaFunctionTable_t cuFunc ;
113
113
114
- #define NB_IPC_STREAM 4
115
114
116
115
static int stage_one_init_ref_count = 0 ;
117
116
static bool stage_three_init_complete = false;
@@ -123,8 +122,7 @@ bool mca_common_cuda_enabled = false;
123
122
static bool mca_common_cuda_register_memory = true;
124
123
static bool mca_common_cuda_warning = false;
125
124
static opal_list_t common_cuda_memory_registrations ;
126
- static CUstream ipcStream [NB_IPC_STREAM ];
127
- static int current_ipc_stream_id = 0 ;
125
+ static CUstream ipcStream = NULL ;
128
126
static CUstream dtohStream = NULL ;
129
127
static CUstream htodStream = NULL ;
130
128
static CUstream memcpyStream = NULL ;
@@ -821,14 +819,12 @@ static int mca_common_cuda_stage_three_init(void)
821
819
}
822
820
823
821
/* Create stream for use in ipc asynchronous copies */
824
- for (i = 0 ; i < NB_IPC_STREAM ; i ++ ) {
825
- res = cuFunc .cuStreamCreate (& ipcStream [i ], 0 );
826
- if (OPAL_UNLIKELY (res != CUDA_SUCCESS )) {
827
- opal_show_help ("help-mpi-common-cuda.txt" , "cuStreamCreate failed" ,
828
- true, OPAL_PROC_MY_HOSTNAME , res );
829
- rc = OPAL_ERROR ;
830
- goto cleanup_and_error ;
831
- }
822
+ res = cuFunc .cuStreamCreate (& ipcStream , 0 );
823
+ if (OPAL_UNLIKELY (res != CUDA_SUCCESS )) {
824
+ opal_show_help ("help-mpi-common-cuda.txt" , "cuStreamCreate failed" ,
825
+ true, OPAL_PROC_MY_HOSTNAME , res );
826
+ rc = OPAL_ERROR ;
827
+ goto cleanup_and_error ;
832
828
}
833
829
834
830
/* Create stream for use in dtoh asynchronous copies */
@@ -1010,10 +1006,8 @@ void mca_common_cuda_fini(void)
1010
1006
if (NULL != cuda_event_unpack_callback_frag_array ) {
1011
1007
free (cuda_event_unpack_callback_frag_array );
1012
1008
}
1013
- for (i = 0 ; i < NB_IPC_STREAM ; i ++ ) {
1014
- if ((NULL != ipcStream [i ]) && ctx_ok ) {
1015
- cuFunc .cuStreamDestroy (ipcStream [i ]);
1016
- }
1009
+ if ((NULL != ipcStream ) && ctx_ok ) {
1010
+ cuFunc .cuStreamDestroy (ipcStream );
1017
1011
}
1018
1012
if ((NULL != dtohStream ) && ctx_ok ) {
1019
1013
cuFunc .cuStreamDestroy (dtohStream );
@@ -1427,7 +1421,7 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
1427
1421
* to measure the advantages of asynchronous copies. */
1428
1422
if (OPAL_LIKELY (mca_common_cuda_async )) {
1429
1423
// printf("I use async memcpy\n");
1430
- result = cuFunc .cuMemcpyAsync ((CUdeviceptr )dst , (CUdeviceptr )src , amount , ipcStream [ current_ipc_stream_id ] );
1424
+ result = cuFunc .cuMemcpyAsync ((CUdeviceptr )dst , (CUdeviceptr )src , amount , ipcStream );
1431
1425
if (OPAL_UNLIKELY (CUDA_SUCCESS != result )) {
1432
1426
opal_show_help ("help-mpi-common-cuda.txt" , "cuMemcpyAsync failed" ,
1433
1427
true, dst , src , amount , result );
@@ -1438,11 +1432,7 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
1438
1432
"CUDA: cuMemcpyAsync passed: dst=%p, src=%p, size=%d" ,
1439
1433
dst , src , (int )amount );
1440
1434
}
1441
- result = cuFunc .cuEventRecord (cuda_event_ipc_array [cuda_event_ipc_first_avail ], ipcStream [current_ipc_stream_id ]);
1442
- current_ipc_stream_id ++ ;
1443
- if (current_ipc_stream_id >= NB_IPC_STREAM ) {
1444
- current_ipc_stream_id = 0 ;
1445
- }
1435
+ result = cuFunc .cuEventRecord (cuda_event_ipc_array [cuda_event_ipc_first_avail ], ipcStream );
1446
1436
if (OPAL_UNLIKELY (CUDA_SUCCESS != result )) {
1447
1437
opal_show_help ("help-mpi-common-cuda.txt" , "cuEventRecord failed" ,
1448
1438
true, OPAL_PROC_MY_HOSTNAME , result );
@@ -1461,7 +1451,7 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
1461
1451
* done = 0 ;
1462
1452
} else {
1463
1453
/* Mimic the async function so they use the same memcpy call. */
1464
- result = cuFunc .cuMemcpyAsync ((CUdeviceptr )dst , (CUdeviceptr )src , amount , ipcStream [ 0 ] );
1454
+ result = cuFunc .cuMemcpyAsync ((CUdeviceptr )dst , (CUdeviceptr )src , amount , ipcStream );
1465
1455
if (OPAL_UNLIKELY (CUDA_SUCCESS != result )) {
1466
1456
opal_show_help ("help-mpi-common-cuda.txt" , "cuMemcpyAsync failed" ,
1467
1457
true, dst , src , amount , result );
@@ -1474,7 +1464,7 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
1474
1464
}
1475
1465
1476
1466
/* Record an event, then wait for it to complete with calls to cuEventQuery */
1477
- result = cuFunc .cuEventRecord (cuda_event_ipc_array [cuda_event_ipc_first_avail ], ipcStream [ 0 ] );
1467
+ result = cuFunc .cuEventRecord (cuda_event_ipc_array [cuda_event_ipc_first_avail ], ipcStream );
1478
1468
if (OPAL_UNLIKELY (CUDA_SUCCESS != result )) {
1479
1469
opal_show_help ("help-mpi-common-cuda.txt" , "cuEventRecord failed" ,
1480
1470
true, OPAL_PROC_MY_HOSTNAME , result );
0 commit comments