Commit ef68652
osc/rdma: do not use local leader optimization for active message RDMA
The local leader optimization means that on each node one process is designated as the local leader, which sets up a shared memory region; the other processes on the same node map their state into the local leader's shared memory. When a process wants to update a peer process's state, it performs atomic operations on the local leader's memory, and the peer's state is then updated through shared memory. Essentially, the local leader optimization uses two different channels: one to transfer data and one to update the peer's state.

This optimization is incorrect for BTLs that use active message RDMA. Active message RDMA uses send/receive to emulate put and atomics, and its put implementation is not delivery-complete: when the initiator gets the completion for a put, it only means the data has been sent, not that it has been delivered into the target buffer. Therefore, if the peer's state is updated through a different communication channel, the peer's state can be updated before the put has completed on the peer, which causes data corruption.

This patch changes the behavior so that for active message RDMA the peer's state is updated through the same channel that transfers the data (i.e., the local leader optimization is disabled). To achieve that, each process needs a pointer to each peer's state, for which this patch introduces the function gather_peer_state(). Note that because active message RDMA does not use memory registration, the state_handle is not gathered. The patch then sets each peer's state pointer from the gathered information and uses the same endpoint to update state and transfer data.

Signed-off-by: Wei Zhang <[email protected]>
1 parent 71b121c commit ef68652
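The ordering hazard described above is easy to reproduce outside of Open MPI. The following minimal, self-contained C sketch (invented names, not OMPI code) models it: a spawned thread plays the active-message put, whose data lands at the target only after the initiator has already observed "local completion", while an atomic counter plays the peer state updated over the separate shared-memory channel.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static int target_buffer = 0;          /* window memory the put targets */
static atomic_int peer_state = 0;      /* peer state on the second channel */

/* emulated active-message put: delivery happens some time after the
 * initiator considers the operation complete */
static void *am_put(void *arg)
{
    (void) arg;
    usleep(1000);                      /* data still in flight */
    target_buffer = 42;                /* delivery completes only here */
    return NULL;
}

int main(void)
{
    pthread_t put_thread;
    pthread_create(&put_thread, NULL, am_put, NULL);

    /* initiator got "completion" (the send was posted), so it updates the
     * peer's state through the other channel (shared memory) */
    atomic_fetch_add(&peer_state, 1);

    /* target side: the state says the put finished, but the data may not
     * have been delivered yet -- this is the corruption window */
    if (atomic_load(&peer_state) == 1 && 42 != target_buffer) {
        printf("peer state updated before put delivered (read %d)\n", target_buffer);
    }

    pthread_join(put_thread, NULL);
    return 0;
}

Disabling the local leader optimization removes the second channel: the state update travels over the same ordered send/receive stream as the put, so it cannot overtake the data.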

File tree

3 files changed: +145 -47 lines

ompi/mca/osc/rdma/osc_rdma.h

Lines changed: 15 additions & 0 deletions
@@ -255,6 +255,21 @@ struct ompi_osc_rdma_module_t {
     /** lock for peer hash table/array */
     opal_mutex_t peer_lock;
 
+    /** flag to indicate whether to use the local leader optimization,
+     * in which on each node a process is designated as the local leader.
+     * The local leader sets up a shared memory region, and all other processes
+     * on the same node map their state to that region. When a process
+     * wants to update a peer's state, it uses atomics on the peer's
+     * local leader to update the peer's state through the shared memory region.
+     * BTLs that use active message RDMA cannot support this optimization,
+     * because active message RDMA uses send/receive to emulate put and
+     * atomics, so the atomics and RMA operations must go through the same
+     * ordered channel.
+     */
+    bool use_local_leader;
+
+    /** array of peer states. Used when the local leader optimization is NOT used */
+    uintptr_t *peer_state_array;
 
     /** BTL(s) in use. Currently this is only used to support RDMA emulation over
      * non-RDMA BTLs. The typical usage is btl/sm + btl/tcp. In the future this

ompi/mca/osc/rdma/osc_rdma_component.c

Lines changed: 78 additions & 10 deletions
@@ -446,6 +446,47 @@ static int ompi_osc_rdma_initialize_region (ompi_osc_rdma_module_t *module, void
     return OMPI_SUCCESS;
 }
 
+/**
+ * @brief gather the pointers to every peer's state inside the communicator
+ *
+ * This function is used when the local leader optimization is NOT used.
+ * In that case each process communicates with its peers directly to update
+ * their state counters (instead of communicating with the peer's local
+ * leader), so it needs a pointer to each peer's state counters in order
+ * to update them with atomics.
+ *
+ * Note that state_handle is not gathered, because the local leader
+ * optimization is disabled only when active message RDMA is used, and
+ * active message RDMA does not need memory registration.
+ *
+ * @param module[in,out] ompi osc rdma module
+ */
+static int gather_peer_state(ompi_osc_rdma_module_t *module)
+{
+    int ret, comm_size;
+
+    comm_size = ompi_comm_size (module->comm);
+
+    module->peer_state_array = calloc(comm_size, sizeof(uintptr_t));
+    if (NULL == module->peer_state_array) {
+        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to allocate memory for module state array!");
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+
+    ret = module->comm->c_coll->coll_allgather(&module->state, sizeof(uintptr_t), MPI_BYTE,
+                                               module->peer_state_array, sizeof(uintptr_t), MPI_BYTE,
+                                               module->comm, module->comm->c_coll->coll_allgather_module);
+    if (OMPI_SUCCESS != ret) {
+        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "module state allgather failed with ompi error code %d", ret);
+        return ret;
+    }
+
+    assert (!module->use_memory_registration);
+
+    return 0;
+}
+
 static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, size_t size)
 {
     size_t total_size, local_rank_array_size, leader_peer_data_size;
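The exchange gather_peer_state() performs can be reproduced with the public MPI API. The standalone sketch below (hypothetical names; it uses MPI_Allgather rather than OMPI's internal c_coll interface) shows each rank publishing the address of its state block so every peer can target it later:

#include <inttypes.h>
#include <mpi.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* stands in for the module's state counters */
    uint64_t state_block[8] = {0};
    uintptr_t my_state = (uintptr_t) state_block;

    uintptr_t *peer_state_array = calloc(size, sizeof(uintptr_t));
    if (NULL == peer_state_array) {
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    /* every rank learns every peer's state address; the addresses are only
     * meaningful as targets for the transport, never for local dereference */
    MPI_Allgather(&my_state, sizeof(uintptr_t), MPI_BYTE,
                  peer_state_array, sizeof(uintptr_t), MPI_BYTE,
                  MPI_COMM_WORLD);

    printf("rank %d sees rank 0 state at 0x%" PRIxPTR "\n",
           rank, peer_state_array[0]);

    free(peer_state_array);
    MPI_Finalize();
    return 0;
}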
@@ -505,6 +546,13 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, s
         }
     }
 
+    if (!module->use_local_leader) {
+        ret = gather_peer_state(module);
+        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
+            return ret;
+        }
+    }
+
     ret = ompi_osc_rdma_new_peer (module, my_rank, &my_peer);
     if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
         return ret;
@@ -593,9 +641,14 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
     /* CPU atomics can be used if every process is on the same node or the NIC allows mixing CPU and NIC atomics */
     module->single_node = local_size == global_size;
     module->use_cpu_atomics = module->single_node;
-
+    module->use_local_leader = true;
     for (int i = 0 ; i < module->btls_in_use ; ++i) {
         module->use_cpu_atomics = module->use_cpu_atomics && !!(module->selected_btls[i]->btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB);
+        /* using the local leader means using different channels to send data to a peer
+         * and to update the peer's state. When different channels are used, active
+         * message RDMA cannot guarantee that puts and atomics complete in the same
+         * order. */
+        module->use_local_leader = module->use_local_leader && !(module->selected_btls[i]->btl_flags & (MCA_BTL_FLAGS_PUT_AM | MCA_BTL_FLAGS_ATOMIC_AM_FOP));
     }
 
     if (1 == local_size) {
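The selection rule above reduces to a pure function of the BTL flags: the optimization survives only if no selected BTL emulates put or fetch-and-op atomics over active messages. A hedged sketch of that rule (the flag bit values below are placeholders for illustration; the real constants live in the BTL framework headers):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* placeholder bit values for illustration only */
#define MCA_BTL_FLAGS_PUT_AM        (1u << 8)
#define MCA_BTL_FLAGS_ATOMIC_AM_FOP (1u << 9)

/* true only when every selected BTL implements put and atomics natively */
static bool can_use_local_leader(const uint32_t *btl_flags, int btls_in_use)
{
    bool use_local_leader = true;
    for (int i = 0; i < btls_in_use; ++i) {
        use_local_leader = use_local_leader &&
            !(btl_flags[i] & (MCA_BTL_FLAGS_PUT_AM | MCA_BTL_FLAGS_ATOMIC_AM_FOP));
    }
    return use_local_leader;
}

int main(void)
{
    /* e.g. a native RDMA BTL plus an AM-emulating BTL disables the optimization */
    uint32_t flags[2] = { 0, MCA_BTL_FLAGS_PUT_AM };
    printf("use_local_leader = %d\n", can_use_local_leader(flags, 2));
    return 0;
}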
@@ -749,6 +802,13 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
             break;
         }
 
+        if (!module->use_local_leader) {
+            ret = gather_peer_state(module);
+            if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
+                break;
+            }
+        }
+
         offset = data_base;
         ompi_osc_rdma_peer_t *local_leader;
         for (int i = 0 ; i < local_size ; ++i) {
@@ -777,17 +837,25 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
             peer->state = (osc_rdma_counter_t) peer_state;
             peer->state_endpoint = NULL;
         } else {
-            /* use my endpoint handle to modify the peer's state */
-            if (module->use_memory_registration) {
-                peer->state_handle = (mca_btl_base_registration_handle_t *) state_region->btl_handle_data;
-            }
-            peer->state = (osc_rdma_counter_t) ((uintptr_t) state_region->base + state_base + module->state_size * i);
-            if (i==0) {
+
+            if (module->use_local_leader) {
+                if (module->use_memory_registration) {
+                    peer->state_handle = (mca_btl_base_registration_handle_t *) state_region->btl_handle_data;
+                }
+                peer->state = (osc_rdma_counter_t) ((uintptr_t) state_region->base + state_base + module->state_size * i);
+                if (i == 0) {
+                    peer->state_endpoint = peer->data_endpoint;
+                    peer->state_btl_index = peer->data_btl_index;
+                } else {
+                    peer->state_endpoint = local_leader->state_endpoint;
+                    peer->state_btl_index = local_leader->state_btl_index;
+                }
+            } else {
+                assert (!module->use_memory_registration);
+                assert (NULL != module->peer_state_array);
+                peer->state = (osc_rdma_counter_t) module->peer_state_array[peer_rank];
                 peer->state_endpoint = peer->data_endpoint;
                 peer->state_btl_index = peer->data_btl_index;
-            } else {
-                peer->state_endpoint = local_leader->state_endpoint;
-                peer->state_btl_index = local_leader->state_btl_index;
             }
         }
 
ompi/mca/osc/rdma/osc_rdma_peer.c

Lines changed: 52 additions & 37 deletions
@@ -138,49 +138,64 @@ static int ompi_osc_rdma_peer_setup (ompi_osc_rdma_module_t *module, ompi_osc_rd
         registration_handle_size = module->selected_btls[0]->btl_registration_handle_size;
     }
 
-    /* each node is responsible for holding a part of the rank -> node/local rank mapping array. this code
-     * calculates the node and offset the mapping can be found. once the mapping has been read the state
-     * part of the peer structure can be initialized. */
-    node_id = peer->rank / RANK_ARRAY_COUNT(module);
-    array_peer_data = (ompi_osc_rdma_region_t *) ((intptr_t) module->node_comm_info + node_id * module->region_size);
-
-    /* the node leader rank is stored in the length field */
-    node_rank = NODE_ID_TO_RANK(module, array_peer_data, node_id);
-    array_index = peer->rank % RANK_ARRAY_COUNT(module);
-
-    array_pointer = array_peer_data->base + array_index * sizeof (rank_data);
-
-    /* lookup the btl endpoint needed to retrieve the mapping */
-    ret = ompi_osc_rdma_peer_btl_endpoint (module, node_rank, &array_btl_index, &array_endpoint);
-    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
-        return OMPI_ERR_UNREACH;
-    }
+    if (module->use_local_leader) {
+        /* each node is responsible for holding a part of the rank -> node/local rank mapping array. this code
+         * calculates the node and offset where the mapping can be found. once the mapping has been read the state
+         * part of the peer structure can be initialized. */
+        node_id = peer->rank / RANK_ARRAY_COUNT(module);
+        array_peer_data = (ompi_osc_rdma_region_t *) ((intptr_t) module->node_comm_info + node_id * module->region_size);
+
+        /* the node leader rank is stored in the length field */
+        node_rank = NODE_ID_TO_RANK(module, array_peer_data, node_id);
+        array_index = peer->rank % RANK_ARRAY_COUNT(module);
+
+        array_pointer = array_peer_data->base + array_index * sizeof (rank_data);
+
+        /* lookup the btl endpoint needed to retrieve the mapping */
+        ret = ompi_osc_rdma_peer_btl_endpoint (module, node_rank, &array_btl_index, &array_endpoint);
+        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
+            return OMPI_ERR_UNREACH;
+        }
 
-    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "reading region data for %d from rank: %d, index: %d, pointer: 0x%" PRIx64
-                     ", size: %lu", peer->rank, node_rank, array_index, array_pointer, sizeof (rank_data));
+        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "reading region data for %d from rank: %d, index: %d, pointer: 0x%" PRIx64
+                         ", size: %lu", peer->rank, node_rank, array_index, array_pointer, sizeof (rank_data));
 
-    ret = ompi_osc_get_data_blocking (module, array_btl_index, array_endpoint, array_pointer,
-                                      (mca_btl_base_registration_handle_t *) array_peer_data->btl_handle_data,
-                                      &rank_data, sizeof (rank_data));
-    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
-        return ret;
-    }
+        ret = ompi_osc_get_data_blocking (module, array_btl_index, array_endpoint, array_pointer,
+                                          (mca_btl_base_registration_handle_t *) array_peer_data->btl_handle_data,
+                                          &rank_data, sizeof (rank_data));
+        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
+            return ret;
+        }
 
-    /* initialize the state part of the peer object. NTH: for now the state data for every node is stored on
-     * every node. this gives a good balance of code complexity and memory usage at this time. we take advantage
-     * of this by re-using the endpoint and pointer stored in the node_comm_info array. */
-    node_peer_data = (ompi_osc_rdma_region_t *) ((intptr_t) module->node_comm_info + rank_data.node_id * module->region_size);
+        /* initialize the state part of the peer object. NTH: for now the state data for every node is stored on
+         * every node. this gives a good balance of code complexity and memory usage at this time. we take advantage
+         * of this by re-using the endpoint and pointer stored in the node_comm_info array. */
+        node_peer_data = (ompi_osc_rdma_region_t *) ((intptr_t) module->node_comm_info + rank_data.node_id * module->region_size);
 
-    peer->state = node_peer_data->base + module->state_offset + module->state_size * rank_data.rank;
+        peer->state = node_peer_data->base + module->state_offset + module->state_size * rank_data.rank;
 
-    if (registration_handle_size) {
-        peer->state_handle = (mca_btl_base_registration_handle_t *) node_peer_data->btl_handle_data;
-    }
+        if (registration_handle_size) {
+            peer->state_handle = (mca_btl_base_registration_handle_t *) node_peer_data->btl_handle_data;
+        }
 
-    ret = ompi_osc_rdma_peer_btl_endpoint (module, NODE_ID_TO_RANK(module, node_peer_data, rank_data.node_id),
-                                           &peer->state_btl_index, &peer->state_endpoint);
-    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
-        return OPAL_ERR_UNREACH;
+        ret = ompi_osc_rdma_peer_btl_endpoint (module, NODE_ID_TO_RANK(module, node_peer_data, rank_data.node_id),
+                                               &peer->state_btl_index, &peer->state_endpoint);
+        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
+            return OPAL_ERR_UNREACH;
+        }
+    } else {
+        assert(NULL != module->peer_state_array);
+        peer->state = module->peer_state_array[peer->rank];
+
+        assert(!module->use_memory_registration);
+        peer->state_handle = NULL;
+
+        /* when the local leader optimization is not used, the same endpoint
+         * is used to transfer data and to update state */
+        peer->state_btl_index = peer->data_btl_index;
+        peer->state_endpoint = peer->data_endpoint;
     }
 
     /* nothing more to do for dynamic memory windows */
