Skip to content

Commit 61aca9e

Browse files
committed
osc/rdma: Load all btls as alternate btls
See lengthy comment in the change, but this patch removes the ability of users to specify a subset of available btls for use by the osc rdma component. The BTL interface was never designed for such usage (which is why there is no similar option for the OB1 PML) and it had clear places where it broke, so remove it. Signed-off-by: Brian Barrett <[email protected]>
1 parent b6b16a6 commit 61aca9e

File tree

1 file changed

+71
-58
lines changed

1 file changed

+71
-58
lines changed

ompi/mca/osc/rdma/osc_rdma_component.c

Lines changed: 71 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
#include "ompi_config.h"
3636

3737
#include <string.h>
38+
#include <stdlib.h>
3839

3940
#include "osc_rdma.h"
4041
#include "osc_rdma_frag.h"
@@ -84,7 +85,6 @@ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_o
8485
static const char* ompi_osc_rdma_set_no_lock_info(opal_infosubscriber_t *obj, const char *key, const char *value);
8586

8687
static char *ompi_osc_rdma_full_connectivity_btls;
87-
static char *ompi_osc_rdma_btl_alternate_names;
8888

8989
static const mca_base_var_enum_value_t ompi_osc_rdma_locking_modes[] = {
9090
{.value = OMPI_OSC_RDMA_LOCKING_TWO_LEVEL, .string = "two_level"},
@@ -257,14 +257,6 @@ static int ompi_osc_rdma_component_register (void)
257257
MCA_BASE_VAR_SCOPE_GROUP, &ompi_osc_rdma_full_connectivity_btls);
258258
free(description_str);
259259

260-
ompi_osc_rdma_btl_alternate_names = "sm,tcp";
261-
opal_asprintf(&description_str, "Comma-delimited list of alternate BTL component names to allow without verifying "
262-
"connectivity (default: %s)", ompi_osc_rdma_btl_alternate_names);
263-
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "alternate_btls", description_str,
264-
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3,
265-
MCA_BASE_VAR_SCOPE_GROUP, &ompi_osc_rdma_btl_alternate_names);
266-
free(description_str);
267-
268260
if (0 == access ("/dev/shm", W_OK)) {
269261
mca_osc_rdma_component.backing_directory = "/dev/shm";
270262
} else {
@@ -875,76 +867,97 @@ static void ompi_osc_rdma_ensure_local_add_procs (void)
875867
free(procs);
876868
}
877869

870+
871+
/*
872+
* qsort() sorting function for ompi_osc_rdma_query_alternate_btls(),
873+
* using latency as the sorting metric.
874+
*/
875+
static int btl_latency_sort_fn(const void *a, const void *b)
876+
{
877+
const struct mca_btl_base_module_t *btl_a = a;
878+
const struct mca_btl_base_module_t *btl_b = b;
879+
880+
if (btl_a->btl_latency < btl_b->btl_latency) {
881+
return -1;
882+
} else if (btl_a->btl_latency == btl_b->btl_latency) {
883+
return 0;
884+
} else {
885+
return 1;
886+
}
887+
}
888+
889+
878890
/**
879891
* @brief query for alternate BTLs
880892
*
881893
* @in comm Communicator to query
882-
* @out module OSC module to store BTLs/count to (optional)
883-
* @out
894+
* @inout module OSC module to store BTLs/count to (optional)
884895
*
885896
* @return OMPI_SUCCESS if BTLs can be found
886897
* @return OMPI_ERR_UNREACH if no BTLs can be found that match
887898
*
888-
* In this case an "alternate" BTL is a BTL does not meet the
889-
* requirements of a BTL outlined in ompi_osc_rdma_query_accelerated_btls().
890-
* Either it does not provide connectivity to all peers, provide
891-
* remote completion, or natively support put/get/atomic.. Since more
892-
* than one BTL may be needed for this support the OSC component will
893-
* disable the use of registration-based RDMA (these BTLs will not be
894-
* used) and will use any remaining BTL. By default the BTLs used will
895-
* be tcp and sm but any single (or pair) of BTLs may be used.
899+
* We directly use the active message rdma wrappers for alternate
900+
* BTLs, in all cases. This greatly simplifies the alternate BTL
901+
* implementation, at the expense of some performance.  With the
902+
* AM wrappers, we can always enforce remote completion and the lack
903+
* of memory registration, at some performance cost. But we can use
904+
* as many BTLs as we like. The module's btl list is sorted by
905+
* latency, so that ompi_osc_rdma_peer_btl_endpoint() picks the lowest
906+
* available latency btl to communicate with the peer. Unlike the OB1
907+
* PML, we only use one BTL per peer.
908+
*
909+
* Like the OB1 PML, there is no verification that there is at least
910+
* one BTL that can communicate with every other peer in the window.
896911
*/
897912
static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_osc_rdma_module_t *module)
898913
{
899914
mca_btl_base_selected_module_t *item;
900-
char **btls_to_use = opal_argv_split (ompi_osc_rdma_btl_alternate_names, ',');
901-
int btls_found = 0;
902-
903-
btls_to_use = opal_argv_split (ompi_osc_rdma_btl_alternate_names, ',');
904-
if (NULL == btls_to_use) {
905-
opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output,
906-
"no alternate BTLs requested: %s", ompi_osc_rdma_btl_alternate_names);
907-
return OMPI_ERR_UNREACH;
908-
}
915+
int ret;
909916

910-
if (module) {
911-
module->btls_in_use = 0;
917+
/* shortcut the trivial query case */
918+
if (NULL == module) {
919+
if (opal_list_is_empty(&mca_btl_base_modules_initialized)) {
920+
return OMPI_ERR_UNREACH;
921+
}
922+
return OMPI_SUCCESS;
912923
}
913924

914-
/* rdma and atomics are only supported with BTLs at the moment */
915-
for (int i = 0 ; btls_to_use[i] ; ++i) {
916-
opal_output_verbose(MCA_BASE_VERBOSE_INFO, "checking for btl %s", btls_to_use[i]);
917-
OPAL_LIST_FOREACH(item, &mca_btl_base_modules_initialized, mca_btl_base_selected_module_t) {
918-
if (NULL != item->btl_module->btl_register_mem) {
919-
opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output,
920-
"skipping RDMA btl when searching for alternate BTL");
921-
continue;
922-
}
923-
924-
if (0 != strcmp (btls_to_use[i], item->btl_module->btl_component->btl_version.mca_component_name)) {
925-
opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output,
926-
"skipping btl %s",
927-
item->btl_module->btl_component->btl_version.mca_component_name);
928-
continue;
929-
}
930-
931-
opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output,
932-
"found alternate btl %s", btls_to_use[i]);
933-
934-
++btls_found;
935-
if (module) {
936-
mca_btl_base_am_rdma_init(item->btl_module);
937-
ompi_osc_rdma_selected_btl_insert(module, item->btl_module, module->btls_in_use++);
938-
}
939-
925+
module->btls_in_use = 0;
926+
927+
/* add all alternate btls to the selected_btls list, not worrying
928+
about ordering yet. We have to add all btls unless we want to
929+
iterate over all endpoints to build the minimum set of btls
930+
needed to communicate with all peers. An MCA parameter just
931+
for osc rdma also wouldn't work, as the BML can decide not to
932+
add an endpoint for a btl given the priority of another btl.
933+
For example, it is not uncommon that the only endpoint created
934+
to a peer on the same host is the sm btl's endpoint. If we
935+
had an osc rdma specific parameter list, and the user
936+
specified a combination not including sm, that would result in
937+
an eventual failure, as no btl would be found to talk to ranks
938+
on the same host.*/
939+
OPAL_LIST_FOREACH(item, &mca_btl_base_modules_initialized, mca_btl_base_selected_module_t) {
940+
opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output,
941+
"found alternate btl %s",
942+
item->btl_module->btl_component->btl_version.mca_component_name);
943+
ret = mca_btl_base_am_rdma_init(item->btl_module);
944+
if (OMPI_SUCCESS != ret) {
945+
return ret;
940946
}
947+
ompi_osc_rdma_selected_btl_insert(module, item->btl_module, module->btls_in_use++);
941948
}
942949

943-
opal_argv_free (btls_to_use);
950+
/* sort based on latency, lowest first */
951+
qsort(module->selected_btls, module->btls_in_use,
952+
sizeof(struct mca_btl_base_module_t*), btl_latency_sort_fn);
944953

945-
return btls_found > 0 ? OMPI_SUCCESS : OMPI_ERR_UNREACH;
954+
/* osc/rdma always uses active message RDMA/atomics on alternate btls, which does not require explicit memory registration */
955+
module->use_memory_registration = false;
956+
957+
return module->btls_in_use > 0 ? OMPI_SUCCESS : OMPI_ERR_UNREACH;
946958
}
947959

960+
948961
/* Check for BTL requirements:
949962
* 1) RDMA (put/get) and ATOMIC operations. We only require cswap
950963
* and fetch and add and will emulate other operations with those

0 commit comments

Comments
 (0)