Commit ef68652
osc/rdma: do not use local leader optimization for active message RDMA
The local leader optimization means that on each node one process is designated as the local leader, which sets up a shared memory region; the other processes on the same node map their state into the local leader's shared memory. When a process wants to update a peer process's state, it performs atomic operations on the local leader's memory, and the peer's state is then updated through shared memory. Essentially, the local leader optimization uses two different channels: one to transfer data and one to update the peer's state.

This optimization is incorrect for BTLs that use active message RDMA. Active message RDMA uses send/receive to emulate put and atomics, and its put implementation is not delivery-complete: when the initiator gets the completion for a put, it only means the data has been sent, not that it has been delivered into the target buffer. Therefore, if the peer's state is updated through a different communication channel, the peer's state can be updated before the put has completed on the peer, which causes data corruption.

This patch changes the behavior so that for active message RDMA the peer's state is updated through the same channel that transfers the data (i.e., the local leader optimization is disabled). To achieve that, each process needs a pointer to each peer's state, for which this patch introduces the function gather_peer_state(). Note that because active message RDMA does not use memory registration, the state_handle is not gathered. The patch then sets each peer's state pointer from the gathered information and uses the same endpoint to update state and transfer data.

Signed-off-by: Wei Zhang <[email protected]>
1 parent 71b121c commit ef68652
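The ordering hazard described above is easy to reproduce outside of Open MPI. The following minimal, self-contained C sketch (invented names, not OMPI code) models it: a spawned thread plays the active-message put, whose data lands at the target only after the initiator has already observed "local completion", while an atomic counter plays the peer state updated over the separate shared-memory channel.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static int target_buffer = 0;          /* window memory the put targets */
static atomic_int peer_state = 0;      /* peer state on the second channel */

/* emulated active-message put: delivery happens some time after the
 * initiator considers the operation complete */
static void *am_put(void *arg)
{
    (void) arg;
    usleep(1000);                      /* data still in flight */
    target_buffer = 42;                /* delivery completes only here */
    return NULL;
}

int main(void)
{
    pthread_t put_thread;
    pthread_create(&put_thread, NULL, am_put, NULL);

    /* initiator got "completion" (the send was posted), so it updates the
     * peer's state through the other channel (shared memory) */
    atomic_fetch_add(&peer_state, 1);

    /* target side: the state says the put finished, but the data may not
     * have been delivered yet -- this is the corruption window */
    if (atomic_load(&peer_state) == 1 && 42 != target_buffer) {
        printf("peer state updated before put delivered (read %d)\n", target_buffer);
    }

    pthread_join(put_thread, NULL);
    return 0;
}

Disabling the local leader optimization removes the second channel: the state update travels over the same ordered send/receive stream as the put, so it cannot overtake the data.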

File tree

3 files changed: +145 -47 lines

ompi/mca/osc/rdma/osc_rdma.h

Lines changed: 15 additions & 0 deletions
@@ -255,6 +255,21 @@ struct ompi_osc_rdma_module_t {
     /** lock for peer hash table/array */
     opal_mutex_t peer_lock;
 
+    /** flag to indicate whether to use the local leader optimization,
+     * in which on each node a process is designated as the local leader.
+     * The local leader sets up a shared memory region, and all other processes
+     * on the same node map their state to that region. When a process
+     * wants to update a peer's state, it uses atomics on the peer's
+     * local leader to update the peer's state through the shared memory region.
+     * BTLs that use active message RDMA cannot support this optimization,
+     * because active message RDMA uses send/receive to emulate put and
+     * atomics, so the atomics and RMA operations must go through the same
+     * ordered channel.
+     */
+    bool use_local_leader;
+
+    /** array of peer states. Used when the local leader optimization is NOT used */
+    uintptr_t *peer_state_array;
 
     /** BTL(s) in use. Currently this is only used to support RDMA emulation over
      * non-RDMA BTLs. The typical usage is btl/sm + btl/tcp. In the future this

ompi/mca/osc/rdma/osc_rdma_component.c

Lines changed: 78 additions & 10 deletions
@@ -446,6 +446,47 @@ static int ompi_osc_rdma_initialize_region (ompi_osc_rdma_module_t *module, void
     return OMPI_SUCCESS;
 }
 
+/**
+ * @brief gather the pointers to every peer's state inside the communicator
+ *
+ * This function is used when the local leader optimization is NOT used.
+ * In that case each process communicates with its peers directly to update
+ * their state counters (instead of communicating with the peer's local
+ * leader), so it needs a pointer to each peer's state counters in order
+ * to update them with atomics.
+ *
+ * Note that state_handle is not gathered, because the local leader
+ * optimization is disabled only when active message RDMA is used, and
+ * active message RDMA does not need memory registration.
+ *
+ * @param module[in,out] ompi osc rdma module
+ */
+static int gather_peer_state(ompi_osc_rdma_module_t *module)
+{
+    int ret, comm_size;
+
+    comm_size = ompi_comm_size (module->comm);
+
+    module->peer_state_array = calloc(comm_size, sizeof(uintptr_t));
+    if (NULL == module->peer_state_array) {
+        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to allocate memory for module state array!");
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+
+    ret = module->comm->c_coll->coll_allgather(&module->state, sizeof(uintptr_t), MPI_BYTE,
+                                               module->peer_state_array, sizeof(uintptr_t), MPI_BYTE,
+                                               module->comm, module->comm->c_coll->coll_allgather_module);
+    if (OMPI_SUCCESS != ret) {
+        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "module state allgather failed with ompi error code %d", ret);
+        return ret;
+    }
+
+    assert (!module->use_memory_registration);
+
+    return 0;
+}
+
 static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, size_t size)
 {
     size_t total_size, local_rank_array_size, leader_peer_data_size;
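The exchange gather_peer_state() performs can be reproduced with the public MPI API. The standalone sketch below (hypothetical names; it uses MPI_Allgather rather than OMPI's internal c_coll interface) shows each rank publishing the address of its state block so every peer can target it later:

#include <inttypes.h>
#include <mpi.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* stands in for the module's state counters */
    uint64_t state_block[8] = {0};
    uintptr_t my_state = (uintptr_t) state_block;

    uintptr_t *peer_state_array = calloc(size, sizeof(uintptr_t));
    if (NULL == peer_state_array) {
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    /* every rank learns every peer's state address; the addresses are only
     * meaningful as targets for the transport, never for local dereference */
    MPI_Allgather(&my_state, sizeof(uintptr_t), MPI_BYTE,
                  peer_state_array, sizeof(uintptr_t), MPI_BYTE,
                  MPI_COMM_WORLD);

    printf("rank %d sees rank 0 state at 0x%" PRIxPTR "\n",
           rank, peer_state_array[0]);

    free(peer_state_array);
    MPI_Finalize();
    return 0;
}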
@@ -505,6 +546,13 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, s
         }
     }
 
+    if (!module->use_local_leader) {
+        ret = gather_peer_state(module);
+        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
+            return ret;
+        }
+    }
+
     ret = ompi_osc_rdma_new_peer (module, my_rank, &my_peer);
     if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
         return ret;
@@ -593,9 +641,14 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
     /* CPU atomics can be used if every process is on the same node or the NIC allows mixing CPU and NIC atomics */
     module->single_node = local_size == global_size;
     module->use_cpu_atomics = module->single_node;
-
+    module->use_local_leader = true;
     for (int i = 0 ; i < module->btls_in_use ; ++i) {
         module->use_cpu_atomics = module->use_cpu_atomics && !!(module->selected_btls[i]->btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB);
+        /* using the local leader means using different channels to send data to a peer
+         * and to update the peer's state. When different channels are used, active
+         * message RDMA cannot guarantee that puts and atomics complete in the same
+         * order. */
+        module->use_local_leader = module->use_local_leader && !(module->selected_btls[i]->btl_flags & (MCA_BTL_FLAGS_PUT_AM | MCA_BTL_FLAGS_ATOMIC_AM_FOP));
     }
 
     if (1 == local_size) {
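The selection rule above reduces to a pure function of the BTL flags: the optimization survives only if no selected BTL emulates put or fetch-and-op atomics over active messages. A hedged sketch of that rule (the flag bit values below are placeholders for illustration; the real constants live in the BTL framework headers):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* placeholder bit values for illustration only */
#define MCA_BTL_FLAGS_PUT_AM        (1u << 8)
#define MCA_BTL_FLAGS_ATOMIC_AM_FOP (1u << 9)

/* true only when every selected BTL implements put and atomics natively */
static bool can_use_local_leader(const uint32_t *btl_flags, int btls_in_use)
{
    bool use_local_leader = true;
    for (int i = 0; i < btls_in_use; ++i) {
        use_local_leader = use_local_leader &&
            !(btl_flags[i] & (MCA_BTL_FLAGS_PUT_AM | MCA_BTL_FLAGS_ATOMIC_AM_FOP));
    }
    return use_local_leader;
}

int main(void)
{
    /* e.g. a native RDMA BTL plus an AM-emulating BTL disables the optimization */
    uint32_t flags[2] = { 0, MCA_BTL_FLAGS_PUT_AM };
    printf("use_local_leader = %d\n", can_use_local_leader(flags, 2));
    return 0;
}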
@@ -749,6 +802,13 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
             break;
         }
 
+        if (!module->use_local_leader) {
+            ret = gather_peer_state(module);
+            if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
+                break;
+            }
+        }
+
         offset = data_base;
         ompi_osc_rdma_peer_t *local_leader;
         for (int i = 0 ; i < local_size ; ++i) {
@@ -777,17 +837,25 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
             peer->state = (osc_rdma_counter_t) peer_state;
             peer->state_endpoint = NULL;
         } else {
-            /* use my endpoint handle to modify the peer's state */
-            if (module->use_memory_registration) {
-                peer->state_handle = (mca_btl_base_registration_handle_t *) state_region->btl_handle_data;
-            }
-            peer->state = (osc_rdma_counter_t) ((uintptr_t) state_region->base + state_base + module->state_size * i);
-            if (i==0) {
+
+            if (module->use_local_leader) {
+                if (module->use_memory_registration) {
+                    peer->state_handle = (mca_btl_base_registration_handle_t *) state_region->btl_handle_data;
+                }
+                peer->state = (osc_rdma_counter_t) ((uintptr_t) state_region->base + state_base + module->state_size * i);
+                if (i == 0) {
+                    peer->state_endpoint = peer->data_endpoint;
+                    peer->state_btl_index = peer->data_btl_index;
+                } else {
+                    peer->state_endpoint = local_leader->state_endpoint;
+                    peer->state_btl_index = local_leader->state_btl_index;
+                }
+            } else {
+                assert (!module->use_memory_registration);
+                assert (NULL != module->peer_state_array);
+                peer->state = (osc_rdma_counter_t) module->peer_state_array[peer_rank];
                 peer->state_endpoint = peer->data_endpoint;
                 peer->state_btl_index = peer->data_btl_index;
-            } else {
-                peer->state_endpoint = local_leader->state_endpoint;
-                peer->state_btl_index = local_leader->state_btl_index;
             }
         }
 
ompi/mca/osc/rdma/osc_rdma_peer.c

Lines changed: 52 additions & 37 deletions
@@ -138,49 +138,64 @@ static int ompi_osc_rdma_peer_setup (ompi_osc_rdma_module_t *module, ompi_osc_rd
         registration_handle_size = module->selected_btls[0]->btl_registration_handle_size;
     }
 
-    /* each node is responsible for holding a part of the rank -> node/local rank mapping array. this code
-     * calculates the node and offset the mapping can be found. once the mapping has been read the state
-     * part of the peer structure can be initialized. */
-    node_id = peer->rank / RANK_ARRAY_COUNT(module);
-    array_peer_data = (ompi_osc_rdma_region_t *) ((intptr_t) module->node_comm_info + node_id * module->region_size);
-
-    /* the node leader rank is stored in the length field */
-    node_rank = NODE_ID_TO_RANK(module, array_peer_data, node_id);
-    array_index = peer->rank % RANK_ARRAY_COUNT(module);
-
-    array_pointer = array_peer_data->base + array_index * sizeof (rank_data);
-
-    /* lookup the btl endpoint needed to retrieve the mapping */
-    ret = ompi_osc_rdma_peer_btl_endpoint (module, node_rank, &array_btl_index, &array_endpoint);
-    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
-        return OMPI_ERR_UNREACH;
-    }
+    if (module->use_local_leader) {
+        /* each node is responsible for holding a part of the rank -> node/local rank mapping array. this code
+         * calculates the node and offset where the mapping can be found. once the mapping has been read the state
+         * part of the peer structure can be initialized. */
+        node_id = peer->rank / RANK_ARRAY_COUNT(module);
+        array_peer_data = (ompi_osc_rdma_region_t *) ((intptr_t) module->node_comm_info + node_id * module->region_size);
+
+        /* the node leader rank is stored in the length field */
+        node_rank = NODE_ID_TO_RANK(module, array_peer_data, node_id);
+        array_index = peer->rank % RANK_ARRAY_COUNT(module);
+
+        array_pointer = array_peer_data->base + array_index * sizeof (rank_data);
+
+        /* lookup the btl endpoint needed to retrieve the mapping */
+        ret = ompi_osc_rdma_peer_btl_endpoint (module, node_rank, &array_btl_index, &array_endpoint);
+        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
+            return OMPI_ERR_UNREACH;
+        }
 
-    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "reading region data for %d from rank: %d, index: %d, pointer: 0x%" PRIx64
-                     ", size: %lu", peer->rank, node_rank, array_index, array_pointer, sizeof (rank_data));
+        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "reading region data for %d from rank: %d, index: %d, pointer: 0x%" PRIx64
+                         ", size: %lu", peer->rank, node_rank, array_index, array_pointer, sizeof (rank_data));
 
-    ret = ompi_osc_get_data_blocking (module, array_btl_index, array_endpoint, array_pointer,
-                                      (mca_btl_base_registration_handle_t *) array_peer_data->btl_handle_data,
-                                      &rank_data, sizeof (rank_data));
-    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
-        return ret;
-    }
+        ret = ompi_osc_get_data_blocking (module, array_btl_index, array_endpoint, array_pointer,
+                                          (mca_btl_base_registration_handle_t *) array_peer_data->btl_handle_data,
+                                          &rank_data, sizeof (rank_data));
+        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
+            return ret;
+        }
 
-    /* initialize the state part of the peer object. NTH: for now the state data for every node is stored on
-     * every node. this gives a good balance of code complexity and memory usage at this time. we take advantage
-     * of this by re-using the endpoint and pointer stored in the node_comm_info array. */
-    node_peer_data = (ompi_osc_rdma_region_t *) ((intptr_t) module->node_comm_info + rank_data.node_id * module->region_size);
+        /* initialize the state part of the peer object. NTH: for now the state data for every node is stored on
+         * every node. this gives a good balance of code complexity and memory usage at this time. we take advantage
+         * of this by re-using the endpoint and pointer stored in the node_comm_info array. */
+        node_peer_data = (ompi_osc_rdma_region_t *) ((intptr_t) module->node_comm_info + rank_data.node_id * module->region_size);
 
-    peer->state = node_peer_data->base + module->state_offset + module->state_size * rank_data.rank;
+        peer->state = node_peer_data->base + module->state_offset + module->state_size * rank_data.rank;
 
-    if (registration_handle_size) {
-        peer->state_handle = (mca_btl_base_registration_handle_t *) node_peer_data->btl_handle_data;
-    }
+        if (registration_handle_size) {
+            peer->state_handle = (mca_btl_base_registration_handle_t *) node_peer_data->btl_handle_data;
+        }
 
-    ret = ompi_osc_rdma_peer_btl_endpoint (module, NODE_ID_TO_RANK(module, node_peer_data, rank_data.node_id),
-                                           &peer->state_btl_index, &peer->state_endpoint);
-    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
-        return OPAL_ERR_UNREACH;
+        ret = ompi_osc_rdma_peer_btl_endpoint (module, NODE_ID_TO_RANK(module, node_peer_data, rank_data.node_id),
+                                               &peer->state_btl_index, &peer->state_endpoint);
+        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
+            return OPAL_ERR_UNREACH;
+        }
+    } else {
+        assert(NULL != module->peer_state_array);
+        peer->state = module->peer_state_array[peer->rank];
+
+        assert(!module->use_memory_registration);
+        peer->state_handle = NULL;
+
+        /* when the local leader optimization is not used, the same endpoint
+         * is used to transfer data and to update state */
+        peer->state_btl_index = peer->data_btl_index;
+        peer->state_endpoint = peer->data_endpoint;
     }
 
     /* nothing more to do for dynamic memory windows */
