Skip to content

Commit 44d59cf

Browse files
committed
Put ompi_datatype_t back to 512 bytes, clean up printf calls and unused functions
Put ompi_datatype_t back to 512 bytes, plus some cleanup: clean up printf calls (now use OPAL_OUTPUT_VERBOSE); rename functions to opal_datatype_cuda_xxx; more cleanup; clean up unused functions.
1 parent 0f63a2c commit 44d59cf

19 files changed

+303
-2314
lines changed

ompi/datatype/ompi_datatype.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ OMPI_DECLSPEC OBJ_CLASS_DECLARATION(ompi_datatype_t);
9494
/* Using set constant for padding of the DATATYPE handles because the size of
9595
* base structure is very close to being the same no matter the bitness.
9696
*/
97-
#define PREDEFINED_DATATYPE_PAD (1024)
97+
#define PREDEFINED_DATATYPE_PAD (512)
9898

9999
struct ompi_predefined_datatype_t {
100100
struct ompi_datatype_t dt;

ompi/mca/pml/ob1/pml_ob1_cuda.c

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,6 @@
3939

4040
#include "opal/datatype/opal_datatype_cuda.h"
4141
#include "opal/mca/common/cuda/common_cuda.h"
42-
#include "opal/mca/btl/smcuda/btl_smcuda.h"
43-
44-
#define CUDA_DDT_WITH_RDMA 1
4542

4643
size_t mca_pml_ob1_rdma_cuda_btls(
4744
mca_bml_base_endpoint_t* bml_endpoint,
@@ -93,7 +90,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
9390

9491
rc = mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_endpoint, sendreq->req_rdma, sendreq->req_rdma_cnt, convertor);
9592
if (rc != 0) {
96-
opal_output(0, "Failed to register convertor, rc= %d\n", rc);
93+
OPAL_OUTPUT_VERBOSE((0, mca_common_cuda_output, "Failed to register convertor, rc= %d\n", rc));
9794
return rc;
9895
}
9996
rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl,
@@ -127,7 +124,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
127124
convertor->gpu_buffer_ptr = base;
128125
convertor->gpu_buffer_size = buffer_size;
129126
sendreq->req_send.req_bytes_packed = convertor->local_size;
130-
opal_output(0, "malloc GPU BUFFER %p for pack, local size %lu, pipeline size %lu, depth %d\n", base, convertor->local_size, bml_btl->btl->btl_cuda_ddt_pipeline_size, bml_btl->btl->btl_cuda_ddt_pipeline_depth);
127+
OPAL_OUTPUT_VERBOSE((OPAL_DATATYPE_CUDA_VERBOSE_LEVEL, mca_common_cuda_output, "RDMA malloc GPU BUFFER %p for pack, local size %lu, pipeline size %lu, depth %d\n", base, convertor->local_size, bml_btl->btl->btl_cuda_ddt_pipeline_size, bml_btl->btl->btl_cuda_ddt_pipeline_depth));
131128
if( 0 != (sendreq->req_rdma_cnt = (uint32_t)mca_pml_ob1_rdma_cuda_btls(
132129
sendreq->req_endpoint,
133130
base,
@@ -137,7 +134,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
137134
convertor->flags &= ~CONVERTOR_CUDA_ASYNC;
138135
rc = mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_endpoint, sendreq->req_rdma, sendreq->req_rdma_cnt, convertor);
139136
if (rc != 0) {
140-
opal_output(0, "Failed to register convertor, rc= %d\n", rc);
137+
OPAL_OUTPUT_VERBOSE((0, mca_common_cuda_output, "Failed to register convertor, rc= %d\n", rc));
141138
return rc;
142139
}
143140
rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl,
@@ -159,13 +156,12 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
159156
buffer_size = convertor->local_size;
160157
}
161158
base = opal_cuda_malloc_gpu_buffer(buffer_size, 0);
159+
OPAL_OUTPUT_VERBOSE((OPAL_DATATYPE_CUDA_VERBOSE_LEVEL, mca_common_cuda_output, "Copy in/out malloc GPU buffer %p, pipeline_size %d\n", base, convertor->pipeline_size));
162160
convertor->gpu_buffer_ptr = base;
163161
convertor->gpu_buffer_size = buffer_size;
164162
convertor->pipeline_seq = 0;
165163
rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0);
166164
}
167-
168-
169165
} else {
170166
if (bml_btl->btl->btl_cuda_max_send_size != 0) {
171167
convertor->pipeline_size = bml_btl->btl->btl_cuda_max_send_size;
@@ -179,6 +175,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
179175
buffer_size = convertor->local_size;
180176
}
181177
base = opal_cuda_malloc_gpu_buffer(buffer_size, 0);
178+
OPAL_OUTPUT_VERBOSE((OPAL_DATATYPE_CUDA_VERBOSE_LEVEL, mca_common_cuda_output, "Copy in/out malloc GPU buffer %p, pipeline_size %d\n", base, convertor->pipeline_size));
182179
convertor->gpu_buffer_ptr = base;
183180
convertor->gpu_buffer_size = buffer_size;
184181
convertor->pipeline_seq = 0;
@@ -188,8 +185,6 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
188185
return rc;
189186
}
190187

191-
192-
193188
size_t mca_pml_ob1_rdma_cuda_btls(
194189
mca_bml_base_endpoint_t* bml_endpoint,
195190
unsigned char* base,

ompi/mca/pml/ob1/pml_ob1_recvreq.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -577,7 +577,7 @@ void mca_pml_ob1_recv_request_frag_copy_start( mca_pml_ob1_recv_request_t* recvr
577577
} else {
578578
buffer_size = convertor->local_size;
579579
}
580-
printf("!!!!!!!!!!malloc size %lu\n", buffer_size);
580+
OPAL_OUTPUT_VERBOSE((OPAL_DATATYPE_CUDA_VERBOSE_LEVEL, mca_common_cuda_output, "Malloc GPU buffer size %lu for frag_copy_start\n", buffer_size));
581581
convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer(buffer_size, 0);
582582
convertor->gpu_buffer_size = buffer_size;
583583
convertor->pipeline_seq = 0;
@@ -611,7 +611,7 @@ void mca_pml_ob1_recv_request_frag_copy_start( mca_pml_ob1_recv_request_t* recvr
611611
* checks the stream events. If we get an error, abort. Should get message
612612
* from CUDA code about what went wrong. */
613613
result = mca_common_cuda_record_htod_event("pml", des, cuda_stream);
614-
printf("!!!!!!!!!!!record h2d\n");
614+
OPAL_OUTPUT_VERBOSE((OPAL_DATATYPE_CUDA_VERBOSE_LEVEL, mca_common_cuda_output, "Record h2d cuda event\n"));
615615
if (OMPI_SUCCESS != result) {
616616
opal_output(0, "%s:%d FATAL", __FILE__, __LINE__);
617617
ompi_rte_abort(-1, NULL);
@@ -650,7 +650,7 @@ void mca_pml_ob1_recv_request_frag_copy_finished( mca_btl_base_module_t* btl,
650650
if(recvreq->req_bytes_received >= recvreq->req_recv.req_bytes_packed) {
651651
opal_convertor_t *convertor = &(recvreq)->req_recv.req_base.req_convertor;
652652
if (convertor->gpu_buffer_ptr != NULL) {
653-
printf("!!!!!!!!!!!!!!!!!!!!!!!i free buffer %p\n", convertor->gpu_buffer_ptr);
653+
OPAL_OUTPUT_VERBOSE((OPAL_DATATYPE_CUDA_VERBOSE_LEVEL, mca_common_cuda_output, "Free GPU pack/unpack buffer %p\n", convertor->gpu_buffer_ptr));
654654
opal_cuda_free_gpu_buffer(convertor->gpu_buffer_ptr, 0);
655655
convertor->gpu_buffer_ptr = NULL;
656656
}

ompi/mca/pml/ob1/pml_ob1_sendreq.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -671,7 +671,6 @@ int mca_pml_ob1_send_request_start_rdma( mca_pml_ob1_send_request_t* sendreq,
671671
sendreq->req_send.req_base.req_convertor.flags &= ~CONVERTOR_CUDA;
672672
if (opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == true) {
673673
data_ptr = sendreq->req_send.req_base.req_convertor.gpu_buffer_ptr;
674-
printf("START RMDA data_ptr %p\n", data_ptr);
675674
} else {
676675
opal_convertor_get_current_pointer (&sendreq->req_send.req_base.req_convertor, &data_ptr);
677676
}

0 commit comments

Comments
 (0)