Skip to content

Commit 3aca4af

Browse files
author
Michael Heinz
committed
REF6976 Silent failure of OMPI over OFI with large message sizes
INTERNAL: STL-59403 The OFI (libfabric) MTL does not respect the maximum message size parameter that OFI provides in the fi_info data. This patch adds this missing max_msg_size field to the mca_ofi_module_t structure and adds a length check to the low-level send routines. Change-Id: I05aa71d332f2df897133b30c28bf37d98f061996 Signed-off-by: Michael Heinz <[email protected]> Reviewed-by: Adam Goldman <[email protected]> Reviewed-by: Brendan Cunningham <[email protected]>
1 parent 6159afc commit 3aca4af

File tree

4 files changed

+25
-3
lines changed

4 files changed

+25
-3
lines changed

ompi/mca/mtl/ofi/help-mtl-ofi.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,3 +75,5 @@ recoverable and your application is likely to abort.
7575
Local host: %s
7676
Remote host: %s
7777
Error: %s (%d)
78+
[message too big]
79+
Message size %llu bigger than supported by selected transport. Max = %llu

ompi/mca/mtl/ofi/mtl_ofi.h

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -421,13 +421,22 @@ ompi_mtl_ofi_send_generic(struct mca_mtl_base_module_t *mtl,
421421
sep_peer_fiaddr = fi_rx_addr(endpoint->peer_fiaddr, ctxt_id, ompi_mtl_ofi.rx_ctx_bits);
422422

423423
ompi_ret = ompi_mtl_datatype_pack(convertor, &start, &length, &free_after);
424-
if (OMPI_SUCCESS != ompi_ret) return ompi_ret;
424+
if (OPAL_UNLIKELY(OMPI_SUCCESS != ompi_ret)) {
425+
return ompi_ret;
426+
}
425427

426428
ofi_req.buffer = (free_after) ? start : NULL;
427429
ofi_req.length = length;
428430
ofi_req.status.MPI_ERROR = OMPI_SUCCESS;
429431
ofi_req.completion_count = 0;
430432

433+
if (OPAL_UNLIKELY(length > endpoint->mtl_ofi_module->max_msg_size)) {
434+
opal_show_help("help-mtl-ofi.txt",
435+
"message too big", false,
436+
length, endpoint->mtl_ofi_module->max_msg_size);
437+
return OMPI_ERROR;
438+
}
439+
431440
if (ofi_cq_data) {
432441
match_bits = mtl_ofi_create_send_tag_CQD(comm->c_contextid, tag);
433442
src_addr = sep_peer_fiaddr;
@@ -553,13 +562,20 @@ ompi_mtl_ofi_isend_generic(struct mca_mtl_base_module_t *mtl,
553562
sep_peer_fiaddr = fi_rx_addr(endpoint->peer_fiaddr, ctxt_id, ompi_mtl_ofi.rx_ctx_bits);
554563

555564
ompi_ret = ompi_mtl_datatype_pack(convertor, &start, &length, &free_after);
556-
if (OMPI_SUCCESS != ompi_ret) return ompi_ret;
565+
if (OMPI_UNLIKELY(OMPI_SUCCESS != ompi_ret)) return ompi_ret;
557566

558567
ofi_req->buffer = (free_after) ? start : NULL;
559568
ofi_req->length = length;
560569
ofi_req->status.MPI_ERROR = OMPI_SUCCESS;
561570
ofi_req->completion_count = 1;
562571

572+
if (OPAL_UNLIKELY(length > endpoint->mtl_ofi_module->max_msg_size)) {
573+
opal_show_help("help-mtl-ofi.txt",
574+
"message too big", false,
575+
length, endpoint->mtl_ofi_module->max_msg_size);
576+
return OMPI_ERROR;
577+
}
578+
563579
if (ofi_cq_data) {
564580
match_bits = mtl_ofi_create_send_tag_CQD(comm->c_contextid, tag);
565581
} else {

ompi/mca/mtl/ofi/mtl_ofi_component.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -881,9 +881,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
881881
}
882882

883883
/**
884-
* Save the maximum inject size.
884+
* Save the maximum sizes.
885885
*/
886886
ompi_mtl_ofi.max_inject_size = prov->tx_attr->inject_size;
887+
ompi_mtl_ofi.max_msg_size = prov->ep_attr->max_msg_size;
887888

888889
/**
889890
* The user is not allowed to exceed MTL_OFI_MAX_PROG_EVENT_COUNT.

ompi/mca/mtl/ofi/mtl_ofi_types.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,9 @@ typedef struct mca_mtl_ofi_module_t {
7070
/** Maximum inject size */
7171
size_t max_inject_size;
7272

73+
/** Largest message that can be sent in a single send. */
74+
size_t max_msg_size;
75+
7376
/** Maximum number of CQ events to read in OFI Progress */
7477
int ofi_progress_event_count;
7578

0 commit comments

Comments
 (0)