Skip to content

Commit c2a29eb

Browse files
committed
Roll back: do not use multiple IPC streams
1 parent a3d4db7 commit c2a29eb

File tree

1 file changed

+13
-23
lines changed

1 file changed

+13
-23
lines changed

opal/mca/common/cuda/common_cuda.c

Lines changed: 13 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -111,7 +111,6 @@ struct cudaFunctionTable {
111111
typedef struct cudaFunctionTable cudaFunctionTable_t;
112112
static cudaFunctionTable_t cuFunc;
113113

114-
#define NB_IPC_STREAM 4
115114

116115
static int stage_one_init_ref_count = 0;
117116
static bool stage_three_init_complete = false;
@@ -123,8 +122,7 @@ bool mca_common_cuda_enabled = false;
123122
static bool mca_common_cuda_register_memory = true;
124123
static bool mca_common_cuda_warning = false;
125124
static opal_list_t common_cuda_memory_registrations;
126-
static CUstream ipcStream[NB_IPC_STREAM];
127-
static int current_ipc_stream_id = 0;
125+
static CUstream ipcStream = NULL;
128126
static CUstream dtohStream = NULL;
129127
static CUstream htodStream = NULL;
130128
static CUstream memcpyStream = NULL;
@@ -821,14 +819,12 @@ static int mca_common_cuda_stage_three_init(void)
821819
}
822820

823821
/* Create stream for use in ipc asynchronous copies */
824-
for (i = 0; i < NB_IPC_STREAM; i++) {
825-
res = cuFunc.cuStreamCreate(&ipcStream[i], 0);
826-
if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
827-
opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed",
828-
true, OPAL_PROC_MY_HOSTNAME, res);
829-
rc = OPAL_ERROR;
830-
goto cleanup_and_error;
831-
}
822+
res = cuFunc.cuStreamCreate(&ipcStream, 0);
823+
if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
824+
opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed",
825+
true, OPAL_PROC_MY_HOSTNAME, res);
826+
rc = OPAL_ERROR;
827+
goto cleanup_and_error;
832828
}
833829

834830
/* Create stream for use in dtoh asynchronous copies */
@@ -1010,10 +1006,8 @@ void mca_common_cuda_fini(void)
10101006
if (NULL != cuda_event_unpack_callback_frag_array) {
10111007
free(cuda_event_unpack_callback_frag_array);
10121008
}
1013-
for (i = 0; i < NB_IPC_STREAM; i++) {
1014-
if ((NULL != ipcStream[i]) && ctx_ok) {
1015-
cuFunc.cuStreamDestroy(ipcStream[i]);
1016-
}
1009+
if ((NULL != ipcStream) && ctx_ok) {
1010+
cuFunc.cuStreamDestroy(ipcStream);
10171011
}
10181012
if ((NULL != dtohStream) && ctx_ok) {
10191013
cuFunc.cuStreamDestroy(dtohStream);
@@ -1427,7 +1421,7 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
14271421
* to measure the advantages of asynchronous copies. */
14281422
if (OPAL_LIKELY(mca_common_cuda_async)) {
14291423
// printf("I use async memcpy\n");
1430-
result = cuFunc.cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, amount, ipcStream[current_ipc_stream_id]);
1424+
result = cuFunc.cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, amount, ipcStream);
14311425
if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
14321426
opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
14331427
true, dst, src, amount, result);
@@ -1438,11 +1432,7 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
14381432
"CUDA: cuMemcpyAsync passed: dst=%p, src=%p, size=%d",
14391433
dst, src, (int)amount);
14401434
}
1441-
result = cuFunc.cuEventRecord(cuda_event_ipc_array[cuda_event_ipc_first_avail], ipcStream[current_ipc_stream_id]);
1442-
current_ipc_stream_id ++;
1443-
if (current_ipc_stream_id >= NB_IPC_STREAM) {
1444-
current_ipc_stream_id = 0;
1445-
}
1435+
result = cuFunc.cuEventRecord(cuda_event_ipc_array[cuda_event_ipc_first_avail], ipcStream);
14461436
if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
14471437
opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
14481438
true, OPAL_PROC_MY_HOSTNAME, result);
@@ -1461,7 +1451,7 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
14611451
*done = 0;
14621452
} else {
14631453
/* Mimic the async function so they use the same memcpy call. */
1464-
result = cuFunc.cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, amount, ipcStream[0]);
1454+
result = cuFunc.cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, amount, ipcStream);
14651455
if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
14661456
opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
14671457
true, dst, src, amount, result);
@@ -1474,7 +1464,7 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
14741464
}
14751465

14761466
/* Record an event, then wait for it to complete with calls to cuEventQuery */
1477-
result = cuFunc.cuEventRecord(cuda_event_ipc_array[cuda_event_ipc_first_avail], ipcStream[0]);
1467+
result = cuFunc.cuEventRecord(cuda_event_ipc_array[cuda_event_ipc_first_avail], ipcStream);
14781468
if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
14791469
opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
14801470
true, OPAL_PROC_MY_HOSTNAME, result);

0 commit comments

Comments
 (0)