ICLDisco
diff --git a/ompi/datatype/ompi_datatype.h b/ompi/datatype/ompi_datatype.h
Lines changed: 1 addition & 1 deletion
diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c
Lines changed: 5 additions & 10 deletions
diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c
Lines changed: 3 additions & 3 deletions
diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.c b/ompi/mca/pml/ob1/pml_ob1_sendreq.c
Lines changed: 0 additions & 1 deletion
@@ -94,7 +94,7 @@ OMPI_DECLSPEC OBJ_CLASS_DECLARATION(ompi_datatype_t);
 /* Using set constant for padding of the DATATYPE handles because the size of
  * base structure is very close to being the same no matter the bitness.
  */
-#define PREDEFINED_DATATYPE_PAD (1024)
+#define PREDEFINED_DATATYPE_PAD (512)
 
 struct ompi_predefined_datatype_t {
     struct ompi_datatype_t dt;
 
@@ -39,9 +39,6 @@
 
 #include "opal/datatype/opal_datatype_cuda.h"
 #include "opal/mca/common/cuda/common_cuda.h"
-#include "opal/mca/btl/smcuda/btl_smcuda.h"
-
-#define CUDA_DDT_WITH_RDMA 1
 
 size_t mca_pml_ob1_rdma_cuda_btls(
     mca_bml_base_endpoint_t* bml_endpoint,
@@ -93,7 +90,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
 
             rc = mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_endpoint, sendreq->req_rdma, sendreq->req_rdma_cnt, convertor); 
             if (rc != 0) {
-                opal_output(0, "Failed to register convertor, rc= %d\n", rc);
+                OPAL_OUTPUT_VERBOSE((0, mca_common_cuda_output, "Failed to register convertor, rc= %d\n", rc));
                 return rc;
             }  
             rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl,
@@ -127,7 +124,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
             convertor->gpu_buffer_ptr = base;
             convertor->gpu_buffer_size = buffer_size;
             sendreq->req_send.req_bytes_packed = convertor->local_size;
-            opal_output(0, "malloc GPU BUFFER %p for pack, local size %lu, pipeline size %lu, depth %d\n", base, convertor->local_size, bml_btl->btl->btl_cuda_ddt_pipeline_size, bml_btl->btl->btl_cuda_ddt_pipeline_depth);
+            OPAL_OUTPUT_VERBOSE((OPAL_DATATYPE_CUDA_VERBOSE_LEVEL, mca_common_cuda_output, "RDMA malloc GPU BUFFER %p for pack, local size %lu, pipeline size %lu, depth %d\n", base, convertor->local_size, bml_btl->btl->btl_cuda_ddt_pipeline_size, bml_btl->btl->btl_cuda_ddt_pipeline_depth));
             if( 0 != (sendreq->req_rdma_cnt = (uint32_t)mca_pml_ob1_rdma_cuda_btls(
                                                                            sendreq->req_endpoint,
                                                                            base,
@@ -137,7 +134,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
                 convertor->flags &= ~CONVERTOR_CUDA_ASYNC;
                 rc = mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_endpoint, sendreq->req_rdma, sendreq->req_rdma_cnt, convertor); 
                 if (rc != 0) {
-                    opal_output(0, "Failed to register convertor, rc= %d\n", rc);
+                    OPAL_OUTPUT_VERBOSE((0, mca_common_cuda_output, "Failed to register convertor, rc= %d\n", rc));
                     return rc;
                 }
                 rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl,
@@ -159,13 +156,12 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
                     buffer_size = convertor->local_size;
                 }
                 base = opal_cuda_malloc_gpu_buffer(buffer_size, 0);
+                OPAL_OUTPUT_VERBOSE((OPAL_DATATYPE_CUDA_VERBOSE_LEVEL, mca_common_cuda_output, "Copy in/out malloc GPU buffer %p, pipeline_size %d\n", base, convertor->pipeline_size));
                 convertor->gpu_buffer_ptr = base;
                 convertor->gpu_buffer_size = buffer_size;
                 convertor->pipeline_seq = 0;
                 rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0);
             }
-
-            
         } else {
             if (bml_btl->btl->btl_cuda_max_send_size != 0) {
                 convertor->pipeline_size = bml_btl->btl->btl_cuda_max_send_size;
@@ -179,6 +175,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
                 buffer_size = convertor->local_size;
             }
             base = opal_cuda_malloc_gpu_buffer(buffer_size, 0);
+            OPAL_OUTPUT_VERBOSE((OPAL_DATATYPE_CUDA_VERBOSE_LEVEL, mca_common_cuda_output, "Copy in/out malloc GPU buffer %p, pipeline_size %d\n", base, convertor->pipeline_size));
             convertor->gpu_buffer_ptr = base;
             convertor->gpu_buffer_size = buffer_size;
             convertor->pipeline_seq = 0;
@@ -188,8 +185,6 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
     return rc;
 }
 
-
-
 size_t mca_pml_ob1_rdma_cuda_btls(
     mca_bml_base_endpoint_t* bml_endpoint,
     unsigned char* base,
 
@@ -577,7 +577,7 @@ void mca_pml_ob1_recv_request_frag_copy_start( mca_pml_ob1_recv_request_t* recvr
                 } else {
                     buffer_size = convertor->local_size;
                 }
-                printf("!!!!!!!!!!malloc size %lu\n", buffer_size);
+                OPAL_OUTPUT_VERBOSE((OPAL_DATATYPE_CUDA_VERBOSE_LEVEL, mca_common_cuda_output, "Malloc GPU buffer size %lu for frag_copy_start\n", buffer_size));
                 convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer(buffer_size, 0);
                 convertor->gpu_buffer_size = buffer_size;
                 convertor->pipeline_seq = 0;
@@ -611,7 +611,7 @@ void mca_pml_ob1_recv_request_frag_copy_start( mca_pml_ob1_recv_request_t* recvr
      * checks the stream events.  If we get an error, abort.  Should get message
      * from CUDA code about what went wrong. */
     result = mca_common_cuda_record_htod_event("pml", des, cuda_stream);
-    printf("!!!!!!!!!!!record h2d\n");
+    OPAL_OUTPUT_VERBOSE((OPAL_DATATYPE_CUDA_VERBOSE_LEVEL, mca_common_cuda_output, "Record h2d cuda event\n"));
     if (OMPI_SUCCESS != result) {
         opal_output(0, "%s:%d FATAL", __FILE__, __LINE__);
         ompi_rte_abort(-1, NULL);
@@ -650,7 +650,7 @@ void mca_pml_ob1_recv_request_frag_copy_finished( mca_btl_base_module_t* btl,
     if(recvreq->req_bytes_received >= recvreq->req_recv.req_bytes_packed) {
         opal_convertor_t *convertor = &(recvreq)->req_recv.req_base.req_convertor;
         if (convertor->gpu_buffer_ptr != NULL) {
-            printf("!!!!!!!!!!!!!!!!!!!!!!!i free buffer %p\n", convertor->gpu_buffer_ptr);
+            OPAL_OUTPUT_VERBOSE((OPAL_DATATYPE_CUDA_VERBOSE_LEVEL, mca_common_cuda_output, "Free GPU pack/unpack buffer %p\n", convertor->gpu_buffer_ptr));
             opal_cuda_free_gpu_buffer(convertor->gpu_buffer_ptr, 0);
             convertor->gpu_buffer_ptr = NULL;
         }    
 
@@ -671,7 +671,6 @@ int mca_pml_ob1_send_request_start_rdma( mca_pml_ob1_send_request_t* sendreq,
         sendreq->req_send.req_base.req_convertor.flags &= ~CONVERTOR_CUDA;
         if (opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == true) {
             data_ptr = sendreq->req_send.req_base.req_convertor.gpu_buffer_ptr;
-            printf("START RMDA data_ptr %p\n", data_ptr);
         } else {
             opal_convertor_get_current_pointer (&sendreq->req_send.req_base.req_convertor, &data_ptr);
         }
Original file line number	Diff line number	Diff line change
`@@ -671,7 +671,6 @@ int mca_pml_ob1_send_request_start_rdma( mca_pml_ob1_send_request_t* sendreq,`
`671`	`671`	`sendreq->req_send.req_base.req_convertor.flags &= ~CONVERTOR_CUDA;`
`672`	`672`	`if (opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == true) {`
`673`	`673`	`data_ptr = sendreq->req_send.req_base.req_convertor.gpu_buffer_ptr;`
`674`		`- printf("START RMDA data_ptr %p\n", data_ptr);`
`675`	`674`	`} else {`
`676`	`675`	`opal_convertor_get_current_pointer (&sendreq->req_send.req_base.req_convertor, &data_ptr);`
`677`	`676`	`}`