Skip to content
This repository was archived by the owner on Sep 30, 2022. It is now read-only.

Commit 1005306

Browse files
committed
Merge pull request #1093 from jjhursey/topic/fix-ras-lsf-v2.x
ras/lsf: Fix affinity for MPMD jobs running under LSF
2 parents ddc9942 + ca41d97 commit 1005306

File tree

3 files changed: 24 additions (+24) and 7 deletions (-7).

orte/mca/ras/alps/ras_alps_module.c

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -549,6 +549,7 @@ orte_ras_alps_read_appinfo_file(opal_list_t *nodes, char *filename,
549549
node->slots_inuse = 0;
550550
node->slots_max = 0;
551551
node->slots = 1;
552+
node->state = ORTE_NODE_STATE_UP;
552553
/* need to order these node ids so the regex generator
553554
* can properly function
554555
*/
@@ -585,6 +586,7 @@ orte_ras_alps_read_appinfo_file(opal_list_t *nodes, char *filename,
585586
node->slots_inuse = 0;
586587
node->slots_max = 0;
587588
node->slots = apNodes[ix].numPEs;
589+
node->state = ORTE_NODE_STATE_UP;
588590
/* need to order these node ids so the regex generator
589591
* can properly function
590592
*/

orte/mca/ras/lsf/ras_lsf_module.c

Lines changed: 21 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@
1111
* All rights reserved.
1212
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
1313
* Copyright (c) 2014 Intel, Inc. All rights reserved
14+
* Copyright (c) 2016 IBM Corporation. All rights reserved.
1415
* $COPYRIGHT$
1516
*
1617
* Additional copyrights may follow
@@ -38,6 +39,7 @@
3839
#include "orte/util/show_help.h"
3940

4041
#include "orte/mca/ras/base/ras_private.h"
42+
#include "orte/mca/ras/base/base.h"
4143
#include "ras_lsf.h"
4244

4345

@@ -98,6 +100,8 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
98100
if (NULL != node && 0 == strcmp(nodelist[i], node->name)) {
99101
/* it is a repeat - just bump the slot count */
100102
++node->slots;
103+
opal_output_verbose(10, orte_ras_base_framework.framework_output,
104+
"ras/lsf: +++ Node (%s) [slots=%d]", node->name, node->slots);
101105
continue;
102106
}
103107

@@ -107,7 +111,11 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
107111
node->slots_inuse = 0;
108112
node->slots_max = 0;
109113
node->slots = 1;
114+
node->state = ORTE_NODE_STATE_UP;
110115
opal_list_append(nodes, &node->super);
116+
117+
opal_output_verbose(10, orte_ras_base_framework.framework_output,
118+
"ras/lsf: New Node (%s) [slots=%d]", node->name, node->slots);
111119
}
112120

113121
/* release the nodelist from lsf */
@@ -141,14 +149,20 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
141149
if (!OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy)) {
142150
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_HWTHREAD);
143151
}
144-
/* get the apps and set the hostfile attribute in each to point to
145-
* the hostfile */
146-
for (i=0; i < jdata->apps->size; i++) {
147-
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
148-
continue;
149-
}
150-
orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, true, (void*)affinity_file, OPAL_STRING);
152+
/*
153+
* Do not set the hostfile attribute on each app_context since that
154+
* would confuse the sequential mapper when it tries to assign bindings
155+
* when running an MPMD job.
156+
* Instead just overwrite the orte_default_hostfile so it will be
157+
* general for all of the app_contexts.
158+
*/
159+
if( NULL != orte_default_hostfile ) {
160+
free(orte_default_hostfile);
161+
orte_default_hostfile = NULL;
151162
}
163+
orte_default_hostfile = strdup(affinity_file);
164+
opal_output_verbose(10, orte_ras_base_framework.framework_output,
165+
"ras/lsf: Set default_hostfile to %s",orte_default_hostfile);
152166

153167
return ORTE_SUCCESS;
154168
}

orte/mca/ras/tm/ras_tm_module.c

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -212,6 +212,7 @@ static int discover(opal_list_t* nodelist, char *pbs_jobid)
212212
node->slots_inuse = 0;
213213
node->slots_max = 0;
214214
node->slots = ppn;
215+
node->state = ORTE_NODE_STATE_UP;
215216
opal_list_append(nodelist, &node->super);
216217
} else {
217218

0 commit comments

Comments
 (0)