3939
4040#include "opal/datatype/opal_datatype_cuda.h"
4141#include "opal/mca/common/cuda/common_cuda.h"
42- #include "opal/mca/btl/smcuda/btl_smcuda.h"
43-
44- #define CUDA_DDT_WITH_RDMA 1
4542
4643size_t mca_pml_ob1_rdma_cuda_btls (
4744 mca_bml_base_endpoint_t * bml_endpoint ,
@@ -93,7 +90,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
9390
9491 rc = mca_pml_ob1_rdma_cuda_btl_register_data (sendreq -> req_endpoint , sendreq -> req_rdma , sendreq -> req_rdma_cnt , convertor );
9592 if (rc != 0 ) {
96- opal_output ( 0 , "Failed to register convertor, rc= %d\n" , rc );
93+ OPAL_OUTPUT_VERBOSE (( 0 , mca_common_cuda_output , "Failed to register convertor, rc= %d\n" , rc ) );
9794 return rc ;
9895 }
9996 rc = mca_pml_ob1_send_request_start_rdma (sendreq , bml_btl ,
@@ -127,7 +124,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
127124 convertor -> gpu_buffer_ptr = base ;
128125 convertor -> gpu_buffer_size = buffer_size ;
129126 sendreq -> req_send .req_bytes_packed = convertor -> local_size ;
130- opal_output ( 0 , " malloc GPU BUFFER %p for pack, local size %lu, pipeline size %lu, depth %d\n" , base , convertor -> local_size , bml_btl -> btl -> btl_cuda_ddt_pipeline_size , bml_btl -> btl -> btl_cuda_ddt_pipeline_depth );
127+ OPAL_OUTPUT_VERBOSE (( OPAL_DATATYPE_CUDA_VERBOSE_LEVEL , mca_common_cuda_output , "RDMA malloc GPU BUFFER %p for pack, local size %lu, pipeline size %lu, depth %d\n" , base , convertor -> local_size , bml_btl -> btl -> btl_cuda_ddt_pipeline_size , bml_btl -> btl -> btl_cuda_ddt_pipeline_depth ) );
131128 if ( 0 != (sendreq -> req_rdma_cnt = (uint32_t )mca_pml_ob1_rdma_cuda_btls (
132129 sendreq -> req_endpoint ,
133130 base ,
@@ -137,7 +134,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
137134 convertor -> flags &= ~CONVERTOR_CUDA_ASYNC ;
138135 rc = mca_pml_ob1_rdma_cuda_btl_register_data (sendreq -> req_endpoint , sendreq -> req_rdma , sendreq -> req_rdma_cnt , convertor );
139136 if (rc != 0 ) {
140- opal_output ( 0 , "Failed to register convertor, rc= %d\n" , rc );
137+ OPAL_OUTPUT_VERBOSE (( 0 , mca_common_cuda_output , "Failed to register convertor, rc= %d\n" , rc ) );
141138 return rc ;
142139 }
143140 rc = mca_pml_ob1_send_request_start_rdma (sendreq , bml_btl ,
@@ -159,13 +156,12 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
159156 buffer_size = convertor -> local_size ;
160157 }
161158 base = opal_cuda_malloc_gpu_buffer (buffer_size , 0 );
159+ OPAL_OUTPUT_VERBOSE ((OPAL_DATATYPE_CUDA_VERBOSE_LEVEL , mca_common_cuda_output , "Copy in/out malloc GPU buffer %p, pipeline_size %d\n" , base , convertor -> pipeline_size ));
162160 convertor -> gpu_buffer_ptr = base ;
163161 convertor -> gpu_buffer_size = buffer_size ;
164162 convertor -> pipeline_seq = 0 ;
165163 rc = mca_pml_ob1_send_request_start_rndv (sendreq , bml_btl , 0 , 0 );
166164 }
167-
168-
169165 } else {
170166 if (bml_btl -> btl -> btl_cuda_max_send_size != 0 ) {
171167 convertor -> pipeline_size = bml_btl -> btl -> btl_cuda_max_send_size ;
@@ -179,6 +175,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
179175 buffer_size = convertor -> local_size ;
180176 }
181177 base = opal_cuda_malloc_gpu_buffer (buffer_size , 0 );
178+ OPAL_OUTPUT_VERBOSE ((OPAL_DATATYPE_CUDA_VERBOSE_LEVEL , mca_common_cuda_output , "Copy in/out malloc GPU buffer %p, pipeline_size %d\n" , base , convertor -> pipeline_size ));
182179 convertor -> gpu_buffer_ptr = base ;
183180 convertor -> gpu_buffer_size = buffer_size ;
184181 convertor -> pipeline_seq = 0 ;
@@ -188,8 +185,6 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
188185 return rc ;
189186}
190187
191-
192-
193188size_t mca_pml_ob1_rdma_cuda_btls (
194189 mca_bml_base_endpoint_t * bml_endpoint ,
195190 unsigned char * base ,
0 commit comments