Description
Description
In file ompi/communicator/comm.c
, declaration of funcation ompi_comm_split_type
is:
int ompi_comm_split_type (ompi_communicator_t *comm, int split_type, int key,
opal_info_t *info, ompi_communicator_t **newcomm)
In the end it may call function ompi_comm_split_type_core
, which performs common processing for it. And the info
seems to be passed to ompi_communicator_t **newcomm
by following code near line 981
:
static int ompi_comm_split_type_core(ompi_communicator_t *comm,
int global_split_type, int local_split_type,
int key, bool need_split, bool no_reorder,
bool no_undefined, opal_info_t *info,
ompi_communicator_t **newcomm)
{
/* ... ... */
ompi_comm_assert_subscribe (newcomp, OMPI_COMM_ASSERT_LAZY_BARRIER);
ompi_comm_assert_subscribe (newcomp, OMPI_COMM_ASSERT_ACTIVE_POLL);
if (info) {
opal_infosubscribe_change_info(&newcomp->super, info);
}
/* ... ... */
return rc;
}
It looks like that opal_infosubscribe_change_info
is needed to read all information stored in info
and write it to newcomm
. But info
and newcomm
here are all newly created, in the function opal_infosubscribe_change_info
, &newcomp->super->s_info
is set by opal_info_set
only if the return value of opal_infosubscribe_inform_subscribers
is not NULL
and equlas to a value in info
.
int opal_infosubscribe_change_info(opal_infosubscriber_t *object, opal_info_t *new_info)
{
opal_info_entry_t *iterator;
const char *updated_value;
/* for each key/value in new info, let subscribers know of new value */
int found_callback;
if (!object->s_info) {
object->s_info = OBJ_NEW(opal_info_t);
}
if (NULL != new_info) {
OPAL_LIST_FOREACH (iterator, &new_info->super, opal_info_entry_t) {
int err = OPAL_SUCCESS;
opal_cstring_t *value_str, *key_str;
value_str = iterator->ie_value;
OBJ_RETAIN(value_str);
key_str = iterator->ie_key;
OBJ_RETAIN(key_str);
updated_value = opal_infosubscribe_inform_subscribers(object, iterator->ie_key->string,
iterator->ie_value->string,
&found_callback);
if (NULL != updated_value
&& 0 != strncmp(updated_value, value_str->string, value_str->length)) {
printf("updated_value %s\n", updated_value);
err = opal_info_set(object->s_info, iterator->ie_key->string, updated_value);
}
OBJ_RELEASE(value_str);
OBJ_RELEASE(key_str);
if (OPAL_SUCCESS != err) {
return err;
}
}
}
return OPAL_SUCCESS;
}
In function opal_infosubscribe_inform_subscribers
, variables list
is initialized to NULL
, and then assigned by call to opal_hash_table_get_value_ptr(table, key, strlen(key), (void **) &list);
. As mentioned earlier, newcomm
and info
are all newly created, and maybe some keys do not yet exist in &object->s_subscriber_table
, which make the list
is still NULL
after the opal_hash_table_get_value_ptr
, and the return of this function will be NULL
, too. In this case info
will not be set to newcomm
.
static const char *opal_infosubscribe_inform_subscribers(opal_infosubscriber_t *object,
const char *key, const char *new_value,
int *found_callback)
{
opal_hash_table_t *table = &object->s_subscriber_table;
opal_list_t *list = NULL;
opal_callback_list_item_t *item;
const char *updated_value = NULL;
if (found_callback) {
*found_callback = 0;
}
/*
* Present the new value to each subscriber. They can decide to accept it, ignore it, or
* over-ride it with their own value (like ignore, but they specify what value they want it to
* have).
*
* Since multiple subscribers could set values, only the last setting is kept as the
* returned value.
*/
if (table) {
opal_hash_table_get_value_ptr(table, key, strlen(key), (void **) &list);
if (list) {
updated_value = new_value;
OPAL_LIST_FOREACH (item, list, opal_callback_list_item_t) {
updated_value = item->callback(object, key, updated_value);
if (found_callback) {
*found_callback = 1;
}
}
}
}
return updated_value;
}
In an old verion ompi_comm_split_type
the 'info' is set by the following code, and the change occurred in #9097. I'm not sure whether I'm understanding the code correctly, so I'd like to confirm if this is a bug.
/* ... ... */
// Copy info if there is one.
newcomp->super.s_info = OBJ_NEW(opal_info_t);
if (info) {
opal_info_dup(info, &(newcomp->super.s_info));
}
/* ... ... */
Background information
In fact I'm doing some work on collective communication algorithm optimization, and need to divide the communicator to achieve hierarchical communication. My implementation is similar to mca_coll_han_comm_create_new
in HAN, I encountered the above problem when passing info
to a newly created subcomm by ompi_comm_split_type
.
I was previously developing in ompi version 4.1, and error occurs after I migrated my code to the main branch. After debuging the problem seems to be caused by changes of the way of setting info
in newcomm
.
I also tried to verify if the same problem exists in the HAN component by insert some print into HAN's code:
int mca_coll_han_comm_create_new(struct ompi_communicator_t *comm,
mca_coll_han_module_t *han_module)
{
/* ... ... */
printf("han start split comm on [%s/%s]\n", ompi_comm_print_cid(comm), comm->c_name);
/* ... ... */
opal_info_set(&comm_info, "ompi_comm_coll_preference", "han");
opal_info_set(&comm_info, "ompi_comm_coll_han_topo_level", "INTRA_NODE");
ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0,
&comm_info, low_comm);
printf("han low comm [%s/%s]\n", ompi_comm_print_cid(*low_comm), (*low_comm)->c_name);
/* ... ... */
opal_info_set(&comm_info, "ompi_comm_coll_han_topo_level", "INTER_NODE");
ompi_comm_split_with_info(comm, low_rank, w_rank, &comm_info, up_comm, false);
printf("han up comm [%s/%s]\n", ompi_comm_print_cid(*up_comm), (*up_comm)->c_name);
/* ... ... */
}
mca_coll_base_module_t *
mca_coll_han_comm_query(struct ompi_communicator_t * comm, int *priority)
{
/* ... ... */
/* All is good -- return a module */
han_module->topologic_level = GLOBAL_COMMUNICATOR;
printf("1111111 comm [%s/%s]\n", ompi_comm_print_cid(comm), comm->c_name);
if (NULL != comm->super.s_info) {
printf("2222222 comm [%s/%s]\n", ompi_comm_print_cid(comm), comm->c_name);
/* Get the info value disaqualifying coll components */
opal_cstring_t *info_str;
opal_info_get(comm->super.s_info, "ompi_comm_coll_han_topo_level",
&info_str, &flag);
if (flag) {
printf("info_str->string: %s\n", info_str->string);
if (0 == strcmp(info_str->string, "INTER_NODE")) {
han_module->topologic_level = INTER_NODE;
} else {
han_module->topologic_level = INTRA_NODE;
}
OBJ_RELEASE(info_str);
}
}
/* ... ... */
}
It seems that the subcomm also cannot get the value of ompi_comm_coll_han_topo_level
set by ompi_comm_split_type. Some of the output is as follow, they are all in order:
1111111 comm [0/MPI_COMM_WORLD]
***00000
***11111
***22222
han start split comm on [0/MPI_COMM_WORLD]
1111111 comm [3/]
2222222 comm [3/]
han low comm [3/MPI COMM 3 SPLIT_TYPE FROM 0]
1111111 comm [4/MPI COMM 4 SPLIT FROM 0]
2222222 comm [4/MPI COMM 4 SPLIT FROM 0]
info_str->string: INTER_NODE
han up comm [4/MPI COMM 4 SPLIT FROM 0]