39
39
40
40
#include "opal/datatype/opal_datatype_cuda.h"
41
41
#include "opal/mca/common/cuda/common_cuda.h"
42
- #include "opal/mca/btl/smcuda/btl_smcuda.h"
43
-
44
- #define CUDA_DDT_WITH_RDMA 1
45
42
46
43
size_t mca_pml_ob1_rdma_cuda_btls (
47
44
mca_bml_base_endpoint_t * bml_endpoint ,
@@ -93,7 +90,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
93
90
94
91
rc = mca_pml_ob1_rdma_cuda_btl_register_data (sendreq -> req_endpoint , sendreq -> req_rdma , sendreq -> req_rdma_cnt , convertor );
95
92
if (rc != 0 ) {
96
- opal_output ( 0 , "Failed to register convertor, rc= %d\n" , rc );
93
+ OPAL_OUTPUT_VERBOSE (( 0 , mca_common_cuda_output , "Failed to register convertor, rc= %d\n" , rc ) );
97
94
return rc ;
98
95
}
99
96
rc = mca_pml_ob1_send_request_start_rdma (sendreq , bml_btl ,
@@ -127,7 +124,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
127
124
convertor -> gpu_buffer_ptr = base ;
128
125
convertor -> gpu_buffer_size = buffer_size ;
129
126
sendreq -> req_send .req_bytes_packed = convertor -> local_size ;
130
- opal_output ( 0 , " malloc GPU BUFFER %p for pack, local size %lu, pipeline size %lu, depth %d\n" , base , convertor -> local_size , bml_btl -> btl -> btl_cuda_ddt_pipeline_size , bml_btl -> btl -> btl_cuda_ddt_pipeline_depth );
127
+ OPAL_OUTPUT_VERBOSE (( OPAL_DATATYPE_CUDA_VERBOSE_LEVEL , mca_common_cuda_output , "RDMA malloc GPU BUFFER %p for pack, local size %lu, pipeline size %lu, depth %d\n" , base , convertor -> local_size , bml_btl -> btl -> btl_cuda_ddt_pipeline_size , bml_btl -> btl -> btl_cuda_ddt_pipeline_depth ) );
131
128
if ( 0 != (sendreq -> req_rdma_cnt = (uint32_t )mca_pml_ob1_rdma_cuda_btls (
132
129
sendreq -> req_endpoint ,
133
130
base ,
@@ -137,7 +134,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
137
134
convertor -> flags &= ~CONVERTOR_CUDA_ASYNC ;
138
135
rc = mca_pml_ob1_rdma_cuda_btl_register_data (sendreq -> req_endpoint , sendreq -> req_rdma , sendreq -> req_rdma_cnt , convertor );
139
136
if (rc != 0 ) {
140
- opal_output ( 0 , "Failed to register convertor, rc= %d\n" , rc );
137
+ OPAL_OUTPUT_VERBOSE (( 0 , mca_common_cuda_output , "Failed to register convertor, rc= %d\n" , rc ) );
141
138
return rc ;
142
139
}
143
140
rc = mca_pml_ob1_send_request_start_rdma (sendreq , bml_btl ,
@@ -159,13 +156,12 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
159
156
buffer_size = convertor -> local_size ;
160
157
}
161
158
base = opal_cuda_malloc_gpu_buffer (buffer_size , 0 );
159
+ OPAL_OUTPUT_VERBOSE ((OPAL_DATATYPE_CUDA_VERBOSE_LEVEL , mca_common_cuda_output , "Copy in/out malloc GPU buffer %p, pipeline_size %d\n" , base , convertor -> pipeline_size ));
162
160
convertor -> gpu_buffer_ptr = base ;
163
161
convertor -> gpu_buffer_size = buffer_size ;
164
162
convertor -> pipeline_seq = 0 ;
165
163
rc = mca_pml_ob1_send_request_start_rndv (sendreq , bml_btl , 0 , 0 );
166
164
}
167
-
168
-
169
165
} else {
170
166
if (bml_btl -> btl -> btl_cuda_max_send_size != 0 ) {
171
167
convertor -> pipeline_size = bml_btl -> btl -> btl_cuda_max_send_size ;
@@ -179,6 +175,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
179
175
buffer_size = convertor -> local_size ;
180
176
}
181
177
base = opal_cuda_malloc_gpu_buffer (buffer_size , 0 );
178
+ OPAL_OUTPUT_VERBOSE ((OPAL_DATATYPE_CUDA_VERBOSE_LEVEL , mca_common_cuda_output , "Copy in/out malloc GPU buffer %p, pipeline_size %d\n" , base , convertor -> pipeline_size ));
182
179
convertor -> gpu_buffer_ptr = base ;
183
180
convertor -> gpu_buffer_size = buffer_size ;
184
181
convertor -> pipeline_seq = 0 ;
@@ -188,8 +185,6 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
188
185
return rc ;
189
186
}
190
187
191
-
192
-
193
188
size_t mca_pml_ob1_rdma_cuda_btls (
194
189
mca_bml_base_endpoint_t * bml_endpoint ,
195
190
unsigned char * base ,
0 commit comments