Skip to content

Commit ca41d97

Browse files
committed
ras/lsf: Fix affinity for MPMD jobs running under LSF
(cherry picked from commit 29b4935)
1 parent fe228e6 commit ca41d97

File tree

1 file changed

+20
-7
lines changed

1 file changed

+20
-7
lines changed

orte/mca/ras/lsf/ras_lsf_module.c

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
* All rights reserved.
1212
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
1313
* Copyright (c) 2014 Intel, Inc. All rights reserved
14+
* Copyright (c) 2016 IBM Corporation. All rights reserved.
1415
* $COPYRIGHT$
1516
*
1617
* Additional copyrights may follow
@@ -38,6 +39,7 @@
3839
#include "orte/util/show_help.h"
3940

4041
#include "orte/mca/ras/base/ras_private.h"
42+
#include "orte/mca/ras/base/base.h"
4143
#include "ras_lsf.h"
4244

4345

@@ -98,6 +100,8 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
98100
if (NULL != node && 0 == strcmp(nodelist[i], node->name)) {
99101
/* it is a repeat - just bump the slot count */
100102
++node->slots;
103+
opal_output_verbose(10, orte_ras_base_framework.framework_output,
104+
"ras/lsf: +++ Node (%s) [slots=%d]", node->name, node->slots);
101105
continue;
102106
}
103107

@@ -109,6 +113,9 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
109113
node->slots = 1;
110114
node->state = ORTE_NODE_STATE_UP;
111115
opal_list_append(nodes, &node->super);
116+
117+
opal_output_verbose(10, orte_ras_base_framework.framework_output,
118+
"ras/lsf: New Node (%s) [slots=%d]", node->name, node->slots);
112119
}
113120

114121
/* release the nodelist from lsf */
@@ -142,14 +149,20 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
142149
if (!OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy)) {
143150
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_HWTHREAD);
144151
}
145-
/* get the apps and set the hostfile attribute in each to point to
146-
* the hostfile */
147-
for (i=0; i < jdata->apps->size; i++) {
148-
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
149-
continue;
150-
}
151-
orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, true, (void*)affinity_file, OPAL_STRING);
152+
/*
153+
* Do not set the hostfile attribute on each app_context, since that
154+
* would confuse the sequential mapper when it assigns bindings
155+
* for an MPMD job.
156+
* Instead, just overwrite orte_default_hostfile so that it applies
157+
* to all of the app_contexts.
158+
*/
159+
if( NULL != orte_default_hostfile ) {
160+
free(orte_default_hostfile);
161+
orte_default_hostfile = NULL;
152162
}
163+
orte_default_hostfile = strdup(affinity_file);
164+
opal_output_verbose(10, orte_ras_base_framework.framework_output,
165+
"ras/lsf: Set default_hostfile to %s",orte_default_hostfile);
153166

154167
return ORTE_SUCCESS;
155168
}

0 commit comments

Comments
 (0)