Skip to content
This repository was archived by the owner on Sep 30, 2022. It is now read-only.

ras/lsf: Fix affinity for MPMD jobs running under LSF #1093

Merged
merged 2 commits into from
Apr 25, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions orte/mca/ras/alps/ras_alps_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -549,6 +549,7 @@ orte_ras_alps_read_appinfo_file(opal_list_t *nodes, char *filename,
node->slots_inuse = 0;
node->slots_max = 0;
node->slots = 1;
node->state = ORTE_NODE_STATE_UP;
/* need to order these node ids so the regex generator
* can properly function
*/
Expand Down Expand Up @@ -585,6 +586,7 @@ orte_ras_alps_read_appinfo_file(opal_list_t *nodes, char *filename,
node->slots_inuse = 0;
node->slots_max = 0;
node->slots = apNodes[ix].numPEs;
node->state = ORTE_NODE_STATE_UP;
/* need to order these node ids so the regex generator
* can properly function
*/
Expand Down
28 changes: 21 additions & 7 deletions orte/mca/ras/lsf/ras_lsf_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
* All rights reserved.
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved
* Copyright (c) 2016 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -38,6 +39,7 @@
#include "orte/util/show_help.h"

#include "orte/mca/ras/base/ras_private.h"
#include "orte/mca/ras/base/base.h"
#include "ras_lsf.h"


Expand Down Expand Up @@ -98,6 +100,8 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
if (NULL != node && 0 == strcmp(nodelist[i], node->name)) {
/* it is a repeat - just bump the slot count */
++node->slots;
opal_output_verbose(10, orte_ras_base_framework.framework_output,
"ras/lsf: +++ Node (%s) [slots=%d]", node->name, node->slots);
continue;
}

Expand All @@ -107,7 +111,11 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
node->slots_inuse = 0;
node->slots_max = 0;
node->slots = 1;
node->state = ORTE_NODE_STATE_UP;
opal_list_append(nodes, &node->super);

opal_output_verbose(10, orte_ras_base_framework.framework_output,
"ras/lsf: New Node (%s) [slots=%d]", node->name, node->slots);
}

/* release the nodelist from lsf */
Expand Down Expand Up @@ -141,14 +149,20 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
if (!OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy)) {
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_HWTHREAD);
}
/* get the apps and set the hostfile attribute in each to point to
* the hostfile */
for (i=0; i < jdata->apps->size; i++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
continue;
}
orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, true, (void*)affinity_file, OPAL_STRING);
/*
* Do not set the hostfile attribute on each app_context since that
* would confuse the sequential mapper when it tries to assign bindings
* when running an MPMD job.
* Instead just overwrite the orte_default_hostfile so it will be
* general for all of the app_contexts.
*/
if( NULL != orte_default_hostfile ) {
free(orte_default_hostfile);
orte_default_hostfile = NULL;
}
orte_default_hostfile = strdup(affinity_file);
opal_output_verbose(10, orte_ras_base_framework.framework_output,
"ras/lsf: Set default_hostfile to %s",orte_default_hostfile);

return ORTE_SUCCESS;
}
Expand Down
1 change: 1 addition & 0 deletions orte/mca/ras/tm/ras_tm_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,7 @@ static int discover(opal_list_t* nodelist, char *pbs_jobid)
node->slots_inuse = 0;
node->slots_max = 0;
node->slots = ppn;
node->state = ORTE_NODE_STATE_UP;
opal_list_append(nodelist, &node->super);
} else {

Expand Down