Skip to content

Commit 1cc6b78

Browse files
committed
osc/rdma: add support for "alternate" btls
This commit updates osc/rdma to support using alternate BTLs when a primary BTL is not available. There may be at most two alternate BTLs in use at any time. The default is selected to cover shared memory (sm) and off-node (tcp). The priority of osc/rdma is a bit lower when using a set of alternate BTLs. This will allow another osc component to win if there is an alternative. Signed-off-by: Nathan Hjelm <[email protected]>
1 parent 0d8140c commit 1cc6b78

File tree

10 files changed

+355
-160
lines changed

10 files changed

+355
-160
lines changed

ompi/mca/osc/rdma/osc_rdma.h

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@
5555

5656
#define RANK_ARRAY_COUNT(module) ((ompi_comm_size ((module)->comm) + (module)->node_count - 1) / (module)->node_count)
5757

58+
#define MCA_OSC_RDMA_MAX_USED_BTLS 2
59+
5860
enum {
5961
OMPI_OSC_RDMA_LOCKING_TWO_LEVEL,
6062
OMPI_OSC_RDMA_LOCKING_ON_DEMAND,
@@ -106,6 +108,9 @@ struct ompi_osc_rdma_component_t {
106108
/** Priority of the osc/rdma component */
107109
unsigned int priority;
108110

111+
/** Priority of the osc/rdma component when using non-RDMA BTLs */
112+
unsigned int alternate_priority;
113+
109114
/** directory where to place backing files */
110115
char *backing_directory;
111116

@@ -251,8 +256,16 @@ struct ompi_osc_rdma_module_t {
251256
opal_mutex_t peer_lock;
252257

253258

254-
/** BTL in use */
255-
struct mca_btl_base_module_t *selected_btl;
259+
/** BTL(s) in use. Currently this is only used to support RDMA emulation over
260+
* non-RDMA BTLs. The typical usage is btl/sm + btl/tcp. In the future this
261+
* could be used to support multiple RDMA-capable BTLs but the memory registration
262+
* paths will need to be updated to pack/unpack multiple registration handles. */
263+
struct mca_btl_base_module_t *selected_btls[MCA_OSC_RDMA_MAX_USED_BTLS];
264+
uint8_t btls_in_use;
265+
266+
/** Only true if one BTL is in use. Memory registration is only supported when
267+
* using a single BTL. */
268+
bool use_memory_registration;
256269

257270
/** registered fragment used for locally buffered RDMA transfers */
258271
struct ompi_osc_rdma_frag_t *rdma_frag;
@@ -363,11 +376,11 @@ static inline bool ompi_osc_rdma_in_passive_epoch (ompi_osc_rdma_module_t *modul
363376
static inline int _ompi_osc_rdma_register (ompi_osc_rdma_module_t *module, struct mca_btl_base_endpoint_t *endpoint, void *ptr,
364377
size_t size, uint32_t flags, mca_btl_base_registration_handle_t **handle, int line, const char *file)
365378
{
366-
if (module->selected_btl->btl_register_mem) {
379+
if (module->use_memory_registration) {
367380
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "registering segment with btl. range: %p - %p (%lu bytes)",
368381
ptr, (void*)((char *) ptr + size), size);
369382

370-
*handle = module->selected_btl->btl_register_mem (module->selected_btl, endpoint, ptr, size, flags);
383+
*handle = module->selected_btls[0]->btl_register_mem (module->selected_btls[0], endpoint, ptr, size, flags);
371384
if (OPAL_UNLIKELY(NULL == *handle)) {
372385
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "failed to register pointer with selected BTL. base: %p, "
373386
"size: %lu. file: %s, line: %d", ptr, (unsigned long) size, file, line);
@@ -385,7 +398,7 @@ static inline int _ompi_osc_rdma_register (ompi_osc_rdma_module_t *module, struc
385398
static inline void _ompi_osc_rdma_deregister (ompi_osc_rdma_module_t *module, mca_btl_base_registration_handle_t *handle, int line, const char *file)
386399
{
387400
if (handle) {
388-
module->selected_btl->btl_deregister_mem (module->selected_btl, handle);
401+
module->selected_btls[0]->btl_deregister_mem (module->selected_btls[0], handle);
389402
}
390403
}
391404

@@ -517,7 +530,7 @@ static inline ompi_osc_rdma_sync_t *ompi_osc_rdma_module_sync_lookup (ompi_osc_r
517530
static bool ompi_osc_rdma_use_btl_flush (ompi_osc_rdma_module_t *module)
518531
{
519532
#if defined(BTL_VERSION) && (BTL_VERSION >= 310)
520-
return !!(module->selected_btl->btl_flush);
533+
return !!(module->selected_btls[0]->btl_flush);
521534
#else
522535
return false;
523536
#endif
@@ -582,7 +595,7 @@ static inline void ompi_osc_rdma_sync_rdma_complete (ompi_osc_rdma_sync_t *sync)
582595
opal_progress ();
583596
} while (ompi_osc_rdma_sync_get_count (sync));
584597
#else
585-
mca_btl_base_module_t *btl_module = sync->module->selected_btl;
598+
mca_btl_base_module_t *btl_module = sync->module->selected_btls[0];
586599

587600
do {
588601
if (!ompi_osc_rdma_use_btl_flush (sync->module)) {
@@ -616,4 +629,9 @@ static inline bool ompi_osc_rdma_oor (int rc)
616629
return (OPAL_SUCCESS != rc && (OPAL_ERR_OUT_OF_RESOURCE == rc || OPAL_ERR_TEMP_OUT_OF_RESOURCE == rc));
617630
}
618631

632+
__opal_attribute_always_inline__
633+
static inline mca_btl_base_module_t *ompi_osc_rdma_selected_btl (ompi_osc_rdma_module_t *module, uint8_t btl_index) {
634+
return module->selected_btls[btl_index];
635+
}
636+
619637
#endif /* OMPI_OSC_RDMA_H */

ompi/mca/osc/rdma/osc_rdma_accumulate.c

Lines changed: 22 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,8 @@ static int ompi_osc_rdma_fetch_and_op_atomic (ompi_osc_rdma_sync_t *sync, const
138138
mca_btl_base_registration_handle_t *target_handle, ompi_op_t *op, ompi_osc_rdma_request_t *req)
139139
{
140140
ompi_osc_rdma_module_t *module = sync->module;
141-
int32_t atomic_flags = module->selected_btl->btl_atomic_flags;
141+
mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
142+
int32_t atomic_flags = selected_btl->btl_atomic_flags;
142143
int btl_op, flags;
143144
int64_t origin;
144145

@@ -160,7 +161,7 @@ static int ompi_osc_rdma_fetch_and_op_atomic (ompi_osc_rdma_sync_t *sync, const
160161

161162
origin = (8 == extent) ? ((int64_t *) origin_addr)[0] : ((int32_t *) origin_addr)[0];
162163

163-
return ompi_osc_rdma_btl_fop (module, peer->data_endpoint, target_address, target_handle, btl_op, origin, flags,
164+
return ompi_osc_rdma_btl_fop (module, peer->data_btl_index, peer->data_endpoint, target_address, target_handle, btl_op, origin, flags,
164165
result_addr, true, NULL, NULL, NULL);
165166
}
166167

@@ -182,7 +183,7 @@ static int ompi_osc_rdma_fetch_and_op_cas (ompi_osc_rdma_sync_t *sync, const voi
182183

183184
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating fetch-and-op using compare-and-swap");
184185

185-
ret = ompi_osc_get_data_blocking (module, peer->data_endpoint, address, target_handle, &old_value, 8);
186+
ret = ompi_osc_get_data_blocking (module, peer->data_btl_index, peer->data_endpoint, address, target_handle, &old_value, 8);
186187
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
187188
return ret;
188189
}
@@ -197,7 +198,7 @@ static int ompi_osc_rdma_fetch_and_op_cas (ompi_osc_rdma_sync_t *sync, const voi
197198
ompi_op_reduce (op, (void *) ((intptr_t) origin_addr + dt->super.true_lb), (void*)((intptr_t) &new_value + offset), 1, dt);
198199
}
199200

200-
ret = ompi_osc_rdma_btl_cswap (module, peer->data_endpoint, address, target_handle,
201+
ret = ompi_osc_rdma_btl_cswap (module, peer->data_btl_index, peer->data_endpoint, address, target_handle,
201202
old_value, new_value, 0, (int64_t*)&new_value);
202203
if (OPAL_SUCCESS != ret || new_value == old_value) {
203204
break;
@@ -218,11 +219,12 @@ static int ompi_osc_rdma_acc_single_atomic (ompi_osc_rdma_sync_t *sync, const vo
218219
ompi_op_t *op, ompi_osc_rdma_request_t *req)
219220
{
220221
ompi_osc_rdma_module_t *module = sync->module;
221-
int32_t atomic_flags = module->selected_btl->btl_atomic_flags;
222+
mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
223+
int32_t atomic_flags = selected_btl->btl_atomic_flags;
222224
int btl_op, flags;
223225
int64_t origin;
224226

225-
if (!(module->selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) {
227+
if (!(selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) {
226228
/* btl put atomics not supported or disabled. fall back on fetch-and-op */
227229
return ompi_osc_rdma_fetch_and_op_atomic (sync, origin_addr, NULL, dt, extent, peer, target_address, target_handle,
228230
op, req);
@@ -248,7 +250,7 @@ static int ompi_osc_rdma_acc_single_atomic (ompi_osc_rdma_sync_t *sync, const vo
248250
*((int64_t *) origin_addr));
249251

250252
/* if we locked the peer it's best to wait for completion before returning */
251-
return ompi_osc_rdma_btl_op (module, peer->data_endpoint, target_address, target_handle, btl_op, origin,
253+
return ompi_osc_rdma_btl_op (module, peer->data_btl_index, peer->data_endpoint, target_address, target_handle, btl_op, origin,
252254
flags, true, NULL, NULL, NULL);
253255
}
254256

@@ -359,7 +361,8 @@ static inline int ompi_osc_rdma_gacc_contig (ompi_osc_rdma_sync_t *sync, const v
359361
/* set up the request */
360362
request->to_free = ptr;
361363

362-
ret = ompi_osc_get_data_blocking (module, peer->data_endpoint, target_address, target_handle, ptr, len);
364+
ret = ompi_osc_get_data_blocking (module, peer->data_btl_index, peer->data_endpoint,
365+
target_address, target_handle, ptr, len);
363366
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
364367
return ret;
365368
}
@@ -644,7 +647,8 @@ static inline int ompi_osc_rdma_cas_atomic (ompi_osc_rdma_sync_t *sync, const vo
644647
bool lock_acquired)
645648
{
646649
ompi_osc_rdma_module_t *module = sync->module;
647-
int32_t atomic_flags = module->selected_btl->btl_atomic_flags;
650+
mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
651+
int32_t atomic_flags = btl->btl_atomic_flags;
648652
const size_t size = datatype->super.size;
649653
int64_t compare, source;
650654
int flags, ret;
@@ -660,8 +664,8 @@ static inline int ompi_osc_rdma_cas_atomic (ompi_osc_rdma_sync_t *sync, const vo
660664
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating compare-and-swap using %d-bit btl atomics. compare: 0x%"
661665
PRIx64 ", origin: 0x%" PRIx64, (int) size * 8, *((int64_t *) compare_addr), *((int64_t *) source_addr));
662666

663-
ret = ompi_osc_rdma_btl_cswap (module, peer->data_endpoint, target_address, target_handle, compare, source, flags,
664-
result_addr);
667+
ret = ompi_osc_rdma_btl_cswap (module, peer->data_btl_index, peer->data_endpoint, target_address, target_handle,
668+
compare, source, flags, result_addr);
665669
if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
666670
ompi_osc_rdma_peer_accumulate_cleanup (module, peer, lock_acquired);
667671
}
@@ -696,6 +700,7 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr,
696700
mca_btl_base_registration_handle_t *target_handle, bool lock_acquired)
697701
{
698702
ompi_osc_rdma_module_t *module = sync->module;
703+
mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
699704
unsigned long len = datatype->super.size;
700705
mca_btl_base_registration_handle_t *local_handle = NULL;
701706
ompi_osc_rdma_frag_t *frag = NULL;
@@ -708,7 +713,8 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr,
708713
", sync %p", len, target_address, (void *) sync);
709714

710715
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "RDMA compare-and-swap initiating blocking btl get...");
711-
ret = ompi_osc_get_data_blocking (module, peer->data_endpoint, target_address, target_handle, result_addr, len);
716+
ret = ompi_osc_get_data_blocking (module, peer->data_btl_index, peer->data_endpoint, target_address,
717+
target_handle, result_addr, len);
712718
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
713719
return ret;
714720
}
@@ -719,7 +725,7 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr,
719725
return OMPI_SUCCESS;
720726
}
721727

722-
if (module->selected_btl->btl_register_mem && len > module->selected_btl->btl_put_local_registration_threshold) {
728+
if (btl->btl_register_mem && len > btl->btl_put_local_registration_threshold) {
723729
do {
724730
ret = ompi_osc_rdma_frag_alloc (module, len, &frag, &ptr);
725731
if (OPAL_UNLIKELY(OMPI_SUCCESS == ret)) {
@@ -736,9 +742,9 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr,
736742
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "RDMA compare-and-swap initiating blocking btl put...");
737743

738744
do {
739-
ret = module->selected_btl->btl_put (module->selected_btl, peer->data_endpoint, ptr, target_address,
740-
local_handle, target_handle, len, 0, MCA_BTL_NO_ORDER,
741-
ompi_osc_rdma_cas_put_complete, (void *) &complete, NULL);
745+
ret = btl->btl_put (btl, peer->data_endpoint, ptr, target_address,
746+
local_handle, target_handle, len, 0, MCA_BTL_NO_ORDER,
747+
ompi_osc_rdma_cas_put_complete, (void *) &complete, NULL);
742748
if (OPAL_SUCCESS == ret || (OPAL_ERR_OUT_OF_RESOURCE != ret && OPAL_ERR_TEMP_OUT_OF_RESOURCE != ret)) {
743749
break;
744750
}

0 commit comments

Comments
 (0)