Skip to content

Convert the orte_job_data pointer array to a hash table so it doesn't… #1391

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 21, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion opal/mca/pmix/pmix112/Makefile.am
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
# Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
# Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2015 Research Organization for Information Science
# and Technology (RIST). All rights reserved.
Expand Down
3 changes: 1 addition & 2 deletions orte/mca/errmgr/default_orted/errmgr_default_orted.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
* reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -584,7 +584,6 @@ static void proc_errors(int fd, short args, void *cbdata)
orte_session_dir_cleanup(jdata->jobid);

/* remove this job from our local job data since it is complete */
opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdata->jobid), NULL);
OBJ_RELEASE(jdata);

/* send it */
Expand Down
11 changes: 4 additions & 7 deletions orte/mca/ess/base/ess_base_std_orted.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -301,11 +301,8 @@ int orte_ess_base_orted_setup(char **hosts)
}
}
/* set up the global job data table and node arrays */
orte_job_data = OBJ_NEW(opal_pointer_array_t);
if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data,
1,
ORTE_GLOBAL_ARRAY_MAX_SIZE,
1))) {
orte_job_data = OBJ_NEW(opal_hash_table_t);
if (ORTE_SUCCESS != (ret = opal_hash_table_init(orte_job_data, 128))) {
ORTE_ERROR_LOG(ret);
error = "setup job array";
goto error;
Expand All @@ -332,7 +329,7 @@ int orte_ess_base_orted_setup(char **hosts)
/* create and store the job data object */
jdata = OBJ_NEW(orte_job_t);
jdata->jobid = ORTE_PROC_MY_NAME->jobid;
opal_pointer_array_set_item(orte_job_data, 0, jdata);
opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata);
/* every job requires at least one app */
app = OBJ_NEW(orte_app_context_t);
opal_pointer_array_set_item(jdata->apps, 0, app);
Expand Down
11 changes: 4 additions & 7 deletions orte/mca/ess/hnp/ess_hnp_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
* Copyright (c) 2011-2014 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -357,11 +357,8 @@ static int rte_init(void)
goto error;
}
/* set up the global job data table and node arrays */
orte_job_data = OBJ_NEW(opal_pointer_array_t);
if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data,
1,
ORTE_GLOBAL_ARRAY_MAX_SIZE,
1))) {
orte_job_data = OBJ_NEW(opal_hash_table_t);
if (ORTE_SUCCESS != (ret = opal_hash_table_init(orte_job_data, 128))) {
ORTE_ERROR_LOG(ret);
error = "setup job array";
goto error;
Expand All @@ -388,7 +385,7 @@ static int rte_init(void)
/* create and store the job data object */
jdata = OBJ_NEW(orte_job_t);
jdata->jobid = ORTE_PROC_MY_NAME->jobid;
opal_pointer_array_set_item(orte_job_data, 0, jdata);
opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata);
/* mark that the daemons have reported as we are the
* only ones in the system right now, and we definitely
* are running!
Expand Down
54 changes: 22 additions & 32 deletions orte/mca/odls/base/odls_base_default_fns.c
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@
int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
orte_jobid_t job)
{
int rc, i;
int rc;
orte_job_t *jdata=NULL, *jptr;
orte_job_map_t *map=NULL;
opal_buffer_t *wireup, jobdata;
Expand Down Expand Up @@ -191,33 +191,29 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
* properly work should a proc from one of the other jobs
* interact with this one */
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_LAUNCHED_DAEMONS, NULL, OPAL_BOOL)) {
void *nptr;
uint32_t key;
OBJ_CONSTRUCT(&jobdata, opal_buffer_t);
numjobs = 0;
for (i=0; i < orte_job_data->size; i++) {
if (NULL == (jptr = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, i))) {
continue;
}
if (ORTE_JOB_STATE_UNTERMINATED < jptr->state) {
/* job already terminated - ignore it */
continue;
}
if (jptr == jdata) {
/* ignore the job we are looking at - we'll get it separately */
continue;
}
/* pack the job struct */
if (ORTE_SUCCESS != (rc = opal_dss.pack(&jobdata, &jptr, 1, ORTE_JOB))) {
ORTE_ERROR_LOG(rc);
return rc;
rc = opal_hash_table_get_first_key_uint32(orte_job_data, &key, (void **)&jptr, &nptr);
while (OPAL_SUCCESS == rc) {
if (NULL != jptr && jptr != jdata &&
ORTE_PROC_MY_NAME->jobid != jptr->jobid) {
/* pack the job struct */
if (ORTE_SUCCESS != (rc = opal_dss.pack(&jobdata, &jptr, 1, ORTE_JOB))) {
ORTE_ERROR_LOG(rc);
return rc;
}
++numjobs;
}
++numjobs;
rc = opal_hash_table_get_next_key_uint32(orte_job_data, &key, (void **)&jptr, nptr, &nptr);
}
/* pack the number of jobs */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &numjobs, 1, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (0 < numjobs) {
/* pack the number of jobs */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &numjobs, 1, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the jobdata buffer */
wireup = &jobdata;
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &wireup, 1, OPAL_BUFFER))) {
Expand Down Expand Up @@ -302,7 +298,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
/* check to see if we already have this one */
if (NULL == orte_get_job_data_object(jdata->jobid)) {
/* nope - add it */
opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdata->jobid), jdata);
opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata);
/* connect each proc to its node object */
for (j=0; j < jdata->procs->size; j++) {
if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
Expand Down Expand Up @@ -401,16 +397,10 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
}
}
goto COMPLETE;
} else {
opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata);
}

if (NULL != orte_get_job_data_object(*job)) {
opal_output(0, "ERROR - JOB ALREADY EXISTS");
/* setup job object for this job */
rc = ORTE_ERR_FATAL;
goto REPORT_ERROR;
}
opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdata->jobid), jdata);

/* ensure the map object is present */
if (NULL == jdata->map) {
jdata->map = OBJ_NEW(orte_job_map_t);
Expand Down
22 changes: 1 addition & 21 deletions orte/mca/plm/base/plm_base_jobid.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -76,27 +77,6 @@ int orte_plm_base_set_hnp_name(void)
*/
int orte_plm_base_create_jobid(orte_job_t *jdata)
{
#if 0
int32_t j;

/* RHC: WHILE ORTE CAN NOW HANDLE RECYCLING OF JOBID'S,
* THE MPI LAYER CANNOT SINCE THERE IS NO WAY TO
* UPDATE THE OMPI_PROC_T LIST AND/OR THE BTL'S
*/

/* see if there is a prior
* jobid that has completed and can be re-used. It can
* never be 0 as that belongs to the HNP and its daemons
*/
for (j=1; j < orte_job_data->size; j++) {
if (NULL == opal_pointer_array_get_item(orte_job_data, j)) {
/* this local jobid is available - reuse it */
jdata->jobid = ORTE_CONSTRUCT_LOCAL_JOBID(ORTE_PROC_MY_NAME->jobid, j);
return ORTE_SUCCESS;
}
}
#endif

if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
/* this job is being restarted - do not assign it
* a new jobid
Expand Down
11 changes: 6 additions & 5 deletions orte/mca/plm/base/plm_base_launch_support.c
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ void orte_plm_base_setup_job(int fd, short args, void *cbdata)
* the orte_rmaps_base_setup_virtual_machine routine to
* search all apps for any hosts to be used by the vm
*/
opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(caddy->jdata->jobid), caddy->jdata);
opal_hash_table_set_value_uint32(orte_job_data, caddy->jdata->jobid, caddy->jdata);
}

/* if job recovery is not enabled, set it to default */
Expand Down Expand Up @@ -1098,18 +1098,19 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
jdatorted->num_reported, jdatorted->num_procs));
if (jdatorted->num_procs == jdatorted->num_reported) {
bool dvm = true;
uint32_t key;
void *nptr;
jdatorted->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
/* activate the daemons_reported state for all jobs
* whose daemons were launched
*/
for (idx=1; idx < orte_job_data->size; idx++) {
if (NULL == (jdata = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, idx))) {
continue;
}
rc = opal_hash_table_get_first_key_uint32(orte_job_data, &key, (void **)&jdata, &nptr);
while (OPAL_SUCCESS == rc) {
dvm = false;
if (ORTE_JOB_STATE_DAEMONS_LAUNCHED == jdata->state) {
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
}
rc = opal_hash_table_get_next_key_uint32(orte_job_data, &key, (void **)&jdata, nptr, &nptr);
}
if (dvm) {
/* must be launching a DVM - activate the state */
Expand Down
38 changes: 20 additions & 18 deletions orte/mca/state/base/state_base_fns.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -522,13 +522,13 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
/* update the proc state */
ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE);
pdata->state = state;
if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_LOCAL)) {
if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_LOCAL)) {
/* Clean up the session directory as if we were the process
* itself. This covers the case where the process died abnormally
* and didn't cleanup its own session directory.
*/
orte_session_dir_finalize(proc);
}
}
/* if we are trying to terminate and our routes are
* gone, then terminate ourselves IF no local procs
* remain (might be some from another job)
Expand All @@ -551,11 +551,11 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
}
/* return the allocated slot for reuse */
cleanup_node(pdata);
/* track job status */
jdata->num_terminated++;
if (jdata->num_terminated == jdata->num_procs) {
/* track job status */
jdata->num_terminated++;
if (jdata->num_terminated == jdata->num_procs) {
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
}
}
}

cleanup:
Expand All @@ -577,6 +577,8 @@ void orte_state_base_check_all_complete(int fd, short args, void *cbdata)
bool one_still_alive;
orte_vpid_t lowest=0;
int32_t i32, *i32ptr;
uint32_t u32;
void *nptr;

opal_output_verbose(2, orte_state_base_framework.framework_output,
"%s state:base:check_job_complete on job %s",
Expand Down Expand Up @@ -722,13 +724,11 @@ void orte_state_base_check_all_complete(int fd, short args, void *cbdata)
* object when we find it
*/
one_still_alive = false;
for (j=1; j < orte_job_data->size; j++) {
if (NULL == (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, j))) {
/* since we are releasing jdata objects as we
* go, we can no longer assume that the job_data
* array is left justified
*/
continue;
j = opal_hash_table_get_first_key_uint32(orte_job_data, &u32, (void **)&job, &nptr);
while (OPAL_SUCCESS == j) {
/* skip the daemon job */
if (job->jobid == ORTE_PROC_MY_NAME->jobid) {
goto next;
}
/* if this is the job we are checking AND it normally terminated,
* then activate the "notify_completed" state - this will release
Expand Down Expand Up @@ -762,20 +762,19 @@ void orte_state_base_check_all_complete(int fd, short args, void *cbdata)
/* this was a debugger daemon. notify that a debugger has detached */
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DEBUGGER_DETACH);
}
opal_pointer_array_set_item(orte_job_data, j, NULL); /* ensure the array has a NULL */
OBJ_RELEASE(jdata);
}
}
continue;
goto next;
}
/* if the job is flagged to not be monitored, skip it */
if (ORTE_FLAG_TEST(job, ORTE_JOB_FLAG_DO_NOT_MONITOR)) {
continue;
goto next;
}
/* when checking for job termination, we must be sure to NOT check
* our own job as it - rather obviously - has NOT terminated!
*/
if (job->num_terminated < job->num_procs) {
if (ORTE_JOB_STATE_NOTIFIED != job->state) {
/* we have at least one job that is not done yet - we cannot
* just return, though, as we need to ensure we clean out the
* job data for the job that just completed
Expand All @@ -795,7 +794,10 @@ void orte_state_base_check_all_complete(int fd, short args, void *cbdata)
job->num_terminated, job->num_procs,
(NULL == jdata) ? "UNKNOWN" : orte_job_state_to_str(jdata->state) ));
}
next:
j = opal_hash_table_get_next_key_uint32(orte_job_data, &u32, (void **)&job, nptr, &nptr);
}

/* if a job is still alive, we just return */
if (one_still_alive) {
OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
Expand Down
Loading