Skip to content

Bring forward the debugger-related changes #2476

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 30, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -353,6 +353,7 @@ orte/mca/sstore/orte_sstore.7

orte/test/mpi/abort
orte/test/mpi/accept
orte/test/mpi/attach
orte/test/mpi/bad_exit
orte/test/mpi/bcast_loop
orte/test/mpi/concurrent_spawn
Expand Down
10 changes: 9 additions & 1 deletion ompi/mca/rte/orte/rte_orte_module.c
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/*
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2012-2014 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
Expand Down Expand Up @@ -133,6 +133,8 @@ void ompi_rte_wait_for_debugger(void)
int debugger;
opal_list_t *codes;
opal_value_t *kv;
char *evar;
int time;

/* See lengthy comment in orte/tools/orterun/debuggers.c about
orte_in_parallel_debugger */
Expand All @@ -152,6 +154,12 @@ void ompi_rte_wait_for_debugger(void)
*/
ompi_debugger_setup_dlls();

if (NULL != (evar = getenv("ORTE_TEST_DEBUGGER_SLEEP"))) {
time = strtol(evar, NULL, 10);
sleep(time);
return;
}

if (orte_standalone_operation) {
/* spin until debugger attaches and releases us */
while (MPIR_debug_gate == 0) {
Expand Down
5 changes: 5 additions & 0 deletions orte/mca/oob/base/oob_base_stubs.c
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,11 @@ void orte_oob_base_send_nb(int fd, short args, void *cbdata)
* this is a local proc we just haven't heard from
* yet due to a race condition. Check that situation */
if (ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) {
++msg->retries;
if (msg->retries < orte_rml_base.max_retries) {
ORTE_OOB_SEND(msg);
return;
}
ORTE_OOB_SEND(msg);
return;
}
Expand Down
6 changes: 4 additions & 2 deletions orte/mca/plm/base/plm_base_launch_support.c
Original file line number Diff line number Diff line change
Expand Up @@ -839,8 +839,10 @@ void orte_plm_base_registered(int fd, short args, void *cbdata)
}

cleanup:
/* need to init_after_spawn for debuggers */
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_READY_FOR_DEBUGGERS);
/* if this wasn't a debugger job, then need to init_after_spawn for debuggers */
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_READY_FOR_DEBUGGERS);
}

OBJ_RELEASE(caddy);
}
Expand Down
2 changes: 2 additions & 0 deletions orte/mca/rml/base/base.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ typedef struct {
opal_pointer_array_t conduits; /* array to hold the open conduits */
opal_list_t posted_recvs;
opal_list_t unmatched_msgs;
int max_retries;
#if OPAL_ENABLE_TIMING
bool timing;
#endif
Expand All @@ -116,6 +117,7 @@ typedef struct {
orte_process_name_t origin;
int status; // returned status on send
orte_rml_tag_t tag; // targeted tag
int retries; // #times we have tried to send it

/* user's send callback functions and data */
union {
Expand Down
9 changes: 9 additions & 0 deletions orte/mca/rml/base/rml_base_frame.c
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,14 @@ static bool selected = false;

static int orte_rml_base_register(mca_base_register_flag_t flags)
{
orte_rml_base.max_retries = 3;
mca_base_var_register("orte", "rml", "base", "max_retries",
"Max #times to retry sending a message",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&orte_rml_base.max_retries);

#if OPAL_ENABLE_TIMING
orte_rml_base.timing = false;
(void) mca_base_var_register ("orte", "rml", "base", "timing",
Expand Down Expand Up @@ -240,6 +248,7 @@ void orte_rml_recv_callback(int status, orte_process_name_t* sender,
/*** RML CLASS INSTANCES ***/
static void send_cons(orte_rml_send_t *ptr)
{
ptr->retries = 0;
ptr->cbdata = NULL;
ptr->iov = NULL;
ptr->buffer = NULL;
Expand Down
4 changes: 2 additions & 2 deletions orte/mca/schizo/singularity/schizo_singularity.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@ static int setup_fork(orte_job_t *jdata, orte_app_context_t *app)
bool takeus = false;
char *t2, *pth, *newenv;

if (NULL != orte_schizo_base.personalities) {
if (NULL != orte_schizo_base.personalities &&
NULL != jdata->personality) {
/* see if we are included */
for (i=0; NULL != jdata->personality[i]; i++) {
if (0 == strcmp(jdata->personality[i], "singularity")) {
Expand Down Expand Up @@ -106,4 +107,3 @@ static int setup_fork(orte_job_t *jdata, orte_app_context_t *app)

return ORTE_SUCCESS;
}

38 changes: 30 additions & 8 deletions orte/orted/orted_submit.c
Original file line number Diff line number Diff line change
Expand Up @@ -896,6 +896,20 @@ int orte_submit_job(char *argv[], int *index,
}
}

/* check for debugger test envars and forward them if necessary */
if (NULL != getenv("ORTE_TEST_DEBUGGER_ATTACH")) {
char *evar;
evar = getenv("ORTE_TEST_DEBUGGER_SLEEP");
for (i=0; i < (int)jdata->num_apps; i++) {
if (NULL != (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
opal_setenv("ORTE_TEST_DEBUGGER_ATTACH", "1", true, &app->env);
if (NULL != evar) {
opal_setenv("ORTE_TEST_DEBUGGER_SLEEP", evar, true, &app->env);
}
}
}
}

/* check for suicide test directives */
if (NULL != getenv("ORTE_TEST_HNP_SUICIDE") ||
NULL != getenv("ORTE_TEST_ORTED_SUICIDE")) {
Expand Down Expand Up @@ -2149,8 +2163,9 @@ static void orte_debugger_init_before_spawn(orte_job_t *jdata)
*/
if (NULL != orte_debugger_test_daemon && !orte_debugger_test_attach) {
opal_output_verbose(2, orte_debug_output,
"%s No debugger test daemon specified",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
"%s Debugger test daemon specified: %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
orte_debugger_test_daemon);
goto launchit;
}
/* if we were given an auto-detect rate, then we want to setup
Expand Down Expand Up @@ -2362,6 +2377,8 @@ static void setup_debugger_job(void)
proc = OBJ_NEW(orte_proc_t);
proc->name.jobid = debugger->jobid;
proc->name.vpid = vpid++;
/* point the proc at the local ORTE daemon as its parent */
proc->parent = node->daemon->name.vpid;
/* set the local/node ranks - we don't actually care
* what these are, but the odls needs them
*/
Expand Down Expand Up @@ -2741,7 +2758,7 @@ static int process(char *orig_line, char *basename, opal_cmd_line_t *cmd_line,
static void open_fifo(void)
{
if (orte_debugger_attach_fd > 0) {
close(orte_debugger_attach_fd);
close(orte_debugger_attach_fd);
}

orte_debugger_attach_fd = open(MPIR_attach_fifo, O_RDONLY | O_NONBLOCK, 0);
Expand All @@ -2760,10 +2777,16 @@ static void open_fifo(void)
return;
}

opal_output_verbose(2, orte_debug_output,
"%s Monitoring debugger attach fifo %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
MPIR_attach_fifo);
if (orte_debugger_test_attach) {
opal_output(0, "%s Monitoring debugger attach fifo %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
MPIR_attach_fifo);
} else {
opal_output_verbose(2, orte_debug_output,
"%s Monitoring debugger attach fifo %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
MPIR_attach_fifo);
}
orte_debugger_attach = (opal_event_t*)malloc(sizeof(opal_event_t));
opal_event_set(orte_event_base, orte_debugger_attach, orte_debugger_attach_fd,
OPAL_EV_READ, attach_debugger, orte_debugger_attach);
Expand Down Expand Up @@ -3232,4 +3255,3 @@ void orte_profile_wakeup(int sd, short args, void *cbdata)
/* abort the job */
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE);
}

7 changes: 1 addition & 6 deletions orte/test/mpi/Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help crisscross read_write ziatest slave reduce-hang ziaprobe ziatest bcast_loop parallel_w8 parallel_w64 parallel_r8 parallel_r64 sio sendrecv_blaster early_abort debugger singleton_client_server intercomm_create spawn_tree init-exit77 mpi_info info_spawn server client paccept pconnect ring hello.sapp binding badcoll iof
PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help crisscross read_write ziatest slave reduce-hang ziaprobe ziatest bcast_loop parallel_w8 parallel_w64 parallel_r8 parallel_r64 sio sendrecv_blaster early_abort debugger singleton_client_server intercomm_create spawn_tree init-exit77 mpi_info info_spawn server client paccept pconnect ring hello.sapp binding badcoll attach

all: $(PROGS)

Expand All @@ -10,11 +10,6 @@ hello_output: hello_output.c
hello_show_help: hello_show_help.c
$(CC) $(CFLAGS) $(CFLAGS_INTERNAL) $^ -o $@

hello.sapp: hello.c myhello.spec
$(CC) $(CFLAGS) $(CLAGS_INTERNAL) hello.c -o hello
singularity build myhello.spec
singularity install hello.sapp

CC = mpicc
CFLAGS = -g --openmpi:linkall
CFLAGS_INTERNAL = -I../../.. -I../../../orte/include -I../../../opal/include
Expand Down
30 changes: 30 additions & 0 deletions orte/test/mpi/attach.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
/* -*- C -*-
*
* $HEADER$
*
* Test helper that writes a one-byte command to the debugger attach FIFO
*/

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

/**
 * Write a single command byte (value 1) to the file named on the command line.
 *
 * Per the usage message, argv[1] is the full path to the debugger FIFO file.
 *
 * @param argc  number of command-line arguments, including the program name
 * @param argv  argv[1] is the path that is opened write-only
 * @return 0 on success; 1 on a usage error or if open/write/close fails
 */
int main(int argc, char* argv[])
{
    unsigned char fifo_cmd = 1;
    int fd;
    int rc = 0;

    /* argv[0] is the program name, so the path argument requires argc >= 2;
     * the previous "1 > argc" test let a missing argument through and
     * passed a NULL path to open() */
    if (argc < 2) {
        fprintf(stderr, "usage: attach <full-path-to-debugger-fifo-file>\n");
        exit(1);
    }

    fd = open(argv[1], O_WRONLY);
    if (fd < 0) {
        perror("attach: open");
        return 1;
    }

    /* exactly one byte must be written for the command to be delivered */
    if (write(fd, &fifo_cmd, sizeof(fifo_cmd)) != (ssize_t)sizeof(fifo_cmd)) {
        perror("attach: write");
        rc = 1;
    }

    if (close(fd) < 0) {
        perror("attach: close");
        rc = 1;
    }

    return rc;
}
2 changes: 2 additions & 0 deletions orte/util/error_strings.c
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,8 @@ const char *orte_job_state_to_str(orte_job_state_t state)
return "FAULT TOLERANCE RESTART";
case ORTE_JOB_STATE_ANY:
return "ANY";
case ORTE_JOB_STATE_DEBUGGER_DETACH:
return "DEBUGGER DETACH";
default:
return "UNKNOWN STATE!";
}
Expand Down