|
35 | 35 | #include "ompi_config.h"
|
36 | 36 |
|
37 | 37 | #include <string.h>
|
| 38 | +#include <stdlib.h> |
38 | 39 |
|
39 | 40 | #include "osc_rdma.h"
|
40 | 41 | #include "osc_rdma_frag.h"
|
@@ -84,7 +85,6 @@ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_o
|
84 | 85 | static const char* ompi_osc_rdma_set_no_lock_info(opal_infosubscriber_t *obj, const char *key, const char *value);
|
85 | 86 |
|
86 | 87 | static char *ompi_osc_rdma_full_connectivity_btls;
|
87 |
| -static char *ompi_osc_rdma_btl_alternate_names; |
88 | 88 |
|
89 | 89 | static const mca_base_var_enum_value_t ompi_osc_rdma_locking_modes[] = {
|
90 | 90 | {.value = OMPI_OSC_RDMA_LOCKING_TWO_LEVEL, .string = "two_level"},
|
@@ -257,14 +257,6 @@ static int ompi_osc_rdma_component_register (void)
|
257 | 257 | MCA_BASE_VAR_SCOPE_GROUP, &ompi_osc_rdma_full_connectivity_btls);
|
258 | 258 | free(description_str);
|
259 | 259 |
|
260 |
| - ompi_osc_rdma_btl_alternate_names = "sm,tcp"; |
261 |
| - opal_asprintf(&description_str, "Comma-delimited list of alternate BTL component names to allow without verifying " |
262 |
| - "connectivity (default: %s)", ompi_osc_rdma_btl_alternate_names); |
263 |
| - (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "alternate_btls", description_str, |
264 |
| - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3, |
265 |
| - MCA_BASE_VAR_SCOPE_GROUP, &ompi_osc_rdma_btl_alternate_names); |
266 |
| - free(description_str); |
267 |
| - |
268 | 260 | if (0 == access ("/dev/shm", W_OK)) {
|
269 | 261 | mca_osc_rdma_component.backing_directory = "/dev/shm";
|
270 | 262 | } else {
|
@@ -875,76 +867,97 @@ static void ompi_osc_rdma_ensure_local_add_procs (void)
|
875 | 867 | free(procs);
|
876 | 868 | }
|
877 | 869 |
|
| 870 | + |
| 871 | +/* |
| 872 | + * qsort() sorting function for ompi_osc_rdma_query_alternate_btls(), |
| 873 | + * using latency as the sorting metric. |
| 874 | + */ |
| 875 | +static int btl_latency_sort_fn(const void *a, const void *b) |
| 876 | +{ |
| 877 | + const struct mca_btl_base_module_t *btl_a = a; |
| 878 | + const struct mca_btl_base_module_t *btl_b = b; |
| 879 | + |
| 880 | + if (btl_a->btl_latency < btl_b->btl_latency) { |
| 881 | + return -1; |
| 882 | + } else if (btl_a->btl_latency == btl_b->btl_latency) { |
| 883 | + return 0; |
| 884 | + } else { |
| 885 | + return 1; |
| 886 | + } |
| 887 | +} |
| 888 | + |
| 889 | + |
878 | 890 | /**
|
879 | 891 | * @brief query for alternate BTLs
|
880 | 892 | *
|
881 | 893 | * @in comm Communicator to query
|
882 |
| - * @out module OSC module to store BTLs/count to (optional) |
883 |
| - * @out |
| 894 | + * @inout module OSC module to store BTLs/count to (optional) |
884 | 895 | *
|
885 | 896 | * @return OMPI_SUCCESS if BTLs can be found
|
886 | 897 | * @return OMPI_ERR_UNREACH if no BTLs can be found that match
|
887 | 898 | *
|
888 |
| - * In this case an "alternate" BTL is a BTL does not meet the |
889 |
| - * requirements of a BTL outlined in ompi_osc_rdma_query_accelerated_btls(). |
890 |
| - * Either it does not provide connectivity to all peers, provide |
891 |
| - * remote completion, or natively support put/get/atomic.. Since more |
892 |
| - * than one BTL may be needed for this support the OSC component will |
893 |
| - * disable the use of registration-based RDMA (these BTLs will not be |
894 |
| - * used) and will use any remaining BTL. By default the BTLs used will |
895 |
| - * be tcp and sm but any single (or pair) of BTLs may be used. |
| 899 | + * We directly use the active message rdma wrappers for alternate |
| 900 | + * BTLs, in all cases. This greatly simplifies the alternate BTL |
| 901 | + * implementation, at the expense of some performance. With the |
| 902 | + * AM wrappers, we can always enforce remote completion and the lack |
| 903 | + * of memory registration, at some performance cost. But we can use |
| 904 | + * as many BTLs as we like. The module's btl list is sorted by |
| 905 | + * latency, so that ompi_osc_rdma_peer_btl_endpoint() picks the lowest |
| 906 | + * available latency btl to communicate with the peer. Unlike the OB1 |
| 907 | + * PML, we only use one BTL per peer. |
| 908 | + * |
| 909 | + * Like the OB1 PML, there is no verification that there is at least |
| 910 | + * one BTL that can communicate with every other peer in the window. |
896 | 911 | */
|
897 | 912 | static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_osc_rdma_module_t *module)
|
898 | 913 | {
|
899 | 914 | mca_btl_base_selected_module_t *item;
|
900 |
| - char **btls_to_use = opal_argv_split (ompi_osc_rdma_btl_alternate_names, ','); |
901 |
| - int btls_found = 0; |
902 |
| - |
903 |
| - btls_to_use = opal_argv_split (ompi_osc_rdma_btl_alternate_names, ','); |
904 |
| - if (NULL == btls_to_use) { |
905 |
| - opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, |
906 |
| - "no alternate BTLs requested: %s", ompi_osc_rdma_btl_alternate_names); |
907 |
| - return OMPI_ERR_UNREACH; |
908 |
| - } |
| 915 | + int ret; |
909 | 916 |
|
910 |
| - if (module) { |
911 |
| - module->btls_in_use = 0; |
| 917 | + /* shortcut the trivial query case */ |
| 918 | + if (NULL == module) { |
| 919 | + if (opal_list_is_empty(&mca_btl_base_modules_initialized)) { |
| 920 | + return OMPI_ERR_UNREACH; |
| 921 | + } |
| 922 | + return OMPI_SUCCESS; |
912 | 923 | }
|
913 | 924 |
|
914 |
| - /* rdma and atomics are only supported with BTLs at the moment */ |
915 |
| - for (int i = 0 ; btls_to_use[i] ; ++i) { |
916 |
| - opal_output_verbose(MCA_BASE_VERBOSE_INFO, "checking for btl %s", btls_to_use[i]); |
917 |
| - OPAL_LIST_FOREACH(item, &mca_btl_base_modules_initialized, mca_btl_base_selected_module_t) { |
918 |
| - if (NULL != item->btl_module->btl_register_mem) { |
919 |
| - opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, |
920 |
| - "skipping RDMA btl when searching for alternate BTL"); |
921 |
| - continue; |
922 |
| - } |
923 |
| - |
924 |
| - if (0 != strcmp (btls_to_use[i], item->btl_module->btl_component->btl_version.mca_component_name)) { |
925 |
| - opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, |
926 |
| - "skipping btl %s", |
927 |
| - item->btl_module->btl_component->btl_version.mca_component_name); |
928 |
| - continue; |
929 |
| - } |
930 |
| - |
931 |
| - opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, |
932 |
| - "found alternate btl %s", btls_to_use[i]); |
933 |
| - |
934 |
| - ++btls_found; |
935 |
| - if (module) { |
936 |
| - mca_btl_base_am_rdma_init(item->btl_module); |
937 |
| - ompi_osc_rdma_selected_btl_insert(module, item->btl_module, module->btls_in_use++); |
938 |
| - } |
939 |
| - |
| 925 | + module->btls_in_use = 0; |
| 926 | + |
| 927 | + /* add all alternate btls to the selected_btls list, not worrying |
| 928 | + about ordering yet. We have to add all btls unless we want to |
| 929 | + iterate over all endpoints to build the minimum set of btls |
| 930 | + needed to communicate with all peers. An MCA parameter just |
| 931 | + for osc rdma also wouldn't work, as the BML can decide not to |
| 932 | + add an endpoint for a btl given the priority of another btl. |
| 933 | + For example, it is not uncommon that the only endpoint created |
| 934 | + to a peer on the same host is the sm btl's endpoint. If we |
| 935 | + had an osc rdma specific parameter list, and the user |
| 936 | + specified a combination not including sm, that would result in |
| 937 | + an eventual failure, as no btl would be found to talk to ranks |
| 938 | + on the same host.*/ |
| 939 | + OPAL_LIST_FOREACH(item, &mca_btl_base_modules_initialized, mca_btl_base_selected_module_t) { |
| 940 | + opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, |
| 941 | + "found alternate btl %s", |
| 942 | + item->btl_module->btl_component->btl_version.mca_component_name); |
| 943 | + ret = mca_btl_base_am_rdma_init(item->btl_module); |
| 944 | + if (OMPI_SUCCESS != ret) { |
| 945 | + return ret; |
940 | 946 | }
|
| 947 | + ompi_osc_rdma_selected_btl_insert(module, item->btl_module, module->btls_in_use++); |
941 | 948 | }
|
942 | 949 |
|
943 |
| - opal_argv_free (btls_to_use); |
| 950 | + /* sort based on latency, lowest first */ |
| 951 | + qsort(module->selected_btls, module->btls_in_use, |
| 952 | + sizeof(struct mca_btl_base_module_t*), btl_latency_sort_fn); |
944 | 953 |
|
945 |
| - return btls_found > 0 ? OMPI_SUCCESS : OMPI_ERR_UNREACH; |
| 954 | + /* osc/rdma always uses active message RDMA/atomics on alternate btls, which does not require explicit memory registration */ |
| 955 | + module->use_memory_registration = false; |
| 956 | + |
| 957 | + return module->btls_in_use > 0 ? OMPI_SUCCESS : OMPI_ERR_UNREACH; |
946 | 958 | }
|
947 | 959 |
|
| 960 | + |
948 | 961 | /* Check for BTL requirements:
|
949 | 962 | * 1) RDMA (put/get) and ATOMIC operations. We only require cswap
|
950 | 963 | * and fetch and add and will emulate other operations with those
|
|
0 commit comments