Skip to content

Commit 9c1f649

Browse files
author
Ralph Castain
committed
Fix debugger attach and cospawn of debugger daemons for the STAT debugger. Add ability to test the support minus the actual debugger.
Fixes #2411

Continue cleanup of STAT debugger attach:
* Limit the number of times we retry sending a message, to avoid an infinite loop.
* Don't execute the "init_debugger_after_spawn" state for debugger jobs.
* Add a new test program "attach" that takes the debugger attach fifo as its argument, and then simulates attach by writing a byte down the fifo.

Output the attach fifo info if we are testing attach so we know where to attach to - otherwise, use the output_verbose.

Always send "debugger release" to the job actually being debugged, not the debugger itself.

Signed-off-by: Ralph Castain <[email protected]>

Remove debug

Signed-off-by: Ralph Castain <[email protected]>
1 parent f8dae5f commit 9c1f649

File tree

14 files changed

+206
-77
lines changed

14 files changed

+206
-77
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -341,6 +341,7 @@ orte/mca/sstore/orte_sstore.7
341341

342342
orte/test/mpi/abort
343343
orte/test/mpi/accept
344+
orte/test/mpi/attach
344345
orte/test/mpi/bad_exit
345346
orte/test/mpi/bcast_loop
346347
orte/test/mpi/concurrent_spawn

ompi/mca/rte/orte/rte_orte_module.c

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/*
22
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
33
* All rights reserved.
4-
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
4+
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
55
* Copyright (c) 2012-2014 The University of Tennessee and The University
66
* of Tennessee Research Foundation. All rights
77
* reserved.
@@ -104,6 +104,8 @@ void ompi_rte_wait_for_debugger(void)
104104
{
105105
int debugger;
106106
orte_rml_recv_cb_t xfer;
107+
char *evar;
108+
int time;
107109

108110
/* See lengthy comment in orte/tools/orterun/debuggers.c about
109111
orte_in_parallel_debugger */
@@ -123,6 +125,12 @@ void ompi_rte_wait_for_debugger(void)
123125
*/
124126
ompi_debugger_setup_dlls();
125127

128+
if (NULL != (evar = getenv("ORTE_TEST_DEBUGGER_SLEEP"))) {
129+
time = strtol(evar, NULL, 10);
130+
sleep(time);
131+
return;
132+
}
133+
126134
if (orte_standalone_operation) {
127135
/* spin until debugger attaches and releases us */
128136
while (MPIR_debug_gate == 0) {

orte/mca/oob/base/oob_base_stubs.c

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
/*
33
* Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
44
* reserved.
5-
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
5+
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
66
* $COPYRIGHT$
77
*
88
* Additional copyrights may follow
@@ -117,9 +117,16 @@ void orte_oob_base_send_nb(int fd, short args, void *cbdata)
117117
* this is a local proc we just haven't heard from
118118
* yet due to a race condition. Check that situation */
119119
if (ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) {
120-
ORTE_OOB_SEND(msg);
121-
return;
120+
++msg->retries;
121+
if (msg->retries < orte_rml_base.max_retries) {
122+
ORTE_OOB_SEND(msg);
123+
return;
124+
}
122125
}
126+
opal_output_verbose(5, orte_oob_base_framework.framework_output,
127+
"%s CANNOT SEND TO %s: TAG %d",
128+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
129+
ORTE_NAME_PRINT(&msg->dst), msg->tag);
123130
msg->status = ORTE_ERR_ADDRESSEE_UNKNOWN;
124131
ORTE_RML_SEND_COMPLETE(msg);
125132
return;
@@ -396,4 +403,3 @@ static void process_uri(char *uri)
396403
}
397404
opal_argv_free(uris);
398405
}
399-

orte/mca/plm/base/plm_base_launch_support.c

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
* Copyright (c) 2009 Institut National de Recherche en Informatique
1414
* et Automatique. All rights reserved.
1515
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
16-
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
16+
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
1717
* Copyright (c) 2014-2015 Research Organization for Information Science
1818
* and Technology (RIST). All rights reserved.
1919
* Copyright (c) 2016 IBM Corporation. All rights reserved.
@@ -757,9 +757,10 @@ void orte_plm_base_registered(int fd, short args, void *cbdata)
757757
}
758758

759759
cleanup:
760-
/* need to init_after_spawn for debuggers */
761-
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_READY_FOR_DEBUGGERS);
762-
760+
/* if this wasn't a debugger job, then need to init_after_spawn for debuggers */
761+
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
762+
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_READY_FOR_DEBUGGERS);
763+
}
763764
OBJ_RELEASE(caddy);
764765
}
765766

orte/mca/rml/base/base.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
* All rights reserved.
1313
* Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights
1414
* reserved.
15-
* Copyright (c) 2014 Intel, Inc. All rights reserved.
15+
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
1616
* $COPYRIGHT$
1717
*
1818
* Additional copyrights may follow
@@ -84,6 +84,7 @@ ORTE_DECLSPEC void orte_rml_base_comm_stop(void);
8484
typedef struct {
8585
opal_list_t posted_recvs;
8686
opal_list_t unmatched_msgs;
87+
int max_retries;
8788
#if OPAL_ENABLE_TIMING
8889
bool timing;
8990
#endif
@@ -123,6 +124,7 @@ typedef struct {
123124
orte_process_name_t origin;
124125
int status; // returned status on send
125126
orte_rml_tag_t tag; // targeted tag
127+
int retries; // #times we have tried to send it
126128

127129
/* user's send callback functions and data */
128130
union {

orte/mca/rml/base/rml_base_frame.c

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,14 @@ static int orte_rml_base_register(mca_base_register_flag_t flags)
6161
&orte_rml_base_wrapper);
6262
(void) mca_base_var_register_synonym(var_id, "orte", "rml",NULL,"wrapper", 0);
6363

64+
orte_rml_base.max_retries = 3;
65+
mca_base_var_register("orte", "rml", "base", "max_retries",
66+
"Max #times to retry sending a message",
67+
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
68+
OPAL_INFO_LVL_9,
69+
MCA_BASE_VAR_SCOPE_READONLY,
70+
&orte_rml_base.max_retries);
71+
6472
#if OPAL_ENABLE_TIMING
6573
orte_rml_base.timing = false;
6674
(void) mca_base_var_register ("orte", "rml", "base", "timing",
@@ -259,6 +267,7 @@ void orte_rml_recv_callback(int status, orte_process_name_t* sender,
259267
/*** RML CLASS INSTANCES ***/
260268
static void send_cons(orte_rml_send_t *ptr)
261269
{
270+
ptr->retries = 0;
262271
ptr->cbdata = NULL;
263272
ptr->iov = NULL;
264273
ptr->buffer = NULL;
@@ -325,4 +334,3 @@ static void prq_des(orte_rml_recv_request_t *ptr)
325334
OBJ_CLASS_INSTANCE(orte_rml_recv_request_t,
326335
opal_object_t,
327336
prq_cons, prq_des);
328-

orte/mca/routed/radix/routed_radix.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
* reserved.
77
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
88
* reserved.
9-
* Copyright (c) 2013 Intel, Inc. All rights reserved.
9+
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
1010
* $COPYRIGHT$
1111
*
1212
* Additional copyrights may follow
@@ -372,6 +372,10 @@ static orte_process_name_t get_route(orte_process_name_t *target)
372372
daemon.jobid = ORTE_PROC_MY_NAME->jobid;
373373
/* find out what daemon hosts this proc */
374374
if (ORTE_VPID_INVALID == (daemon.vpid = orte_get_proc_daemon_vpid(target))) {
375+
opal_output_verbose(2, orte_routed_base_framework.framework_output,
376+
"%s ATTEMPTING TO SEND TO %s",
377+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
378+
ORTE_NAME_PRINT(target));
375379
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
376380
ret = ORTE_NAME_INVALID;
377381
goto found;

orte/mca/schizo/base/schizo_base_stubs.c

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
22
/*
3-
* Copyright (c) 2015 Intel, Inc. All rights reserved.
3+
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
44
* $COPYRIGHT$
55
*
66
* Additional copyrights may follow
@@ -22,7 +22,6 @@ int orte_schizo_base_parse_cli(char *personality,
2222
orte_schizo_base_active_module_t *mod;
2323

2424
if (NULL == personality) {
25-
opal_output(0, "NULL PERSONALITY");
2625
return ORTE_ERR_NOT_SUPPORTED;
2726
}
2827

@@ -63,6 +62,11 @@ int orte_schizo_base_setup_fork(orte_job_t *jdata,
6362
int rc;
6463
orte_schizo_base_active_module_t *mod;
6564

65+
/* if no personality was specified, then nothing to do */
66+
if (NULL == jdata->personality) {
67+
return ORTE_SUCCESS;
68+
}
69+
6670
OPAL_LIST_FOREACH(mod, &orte_schizo_base.active_modules, orte_schizo_base_active_module_t) {
6771
if (0 == strcmp(jdata->personality, mod->component->mca_component_name)) {
6872
if (NULL != mod->module->setup_fork) {
@@ -81,6 +85,11 @@ int orte_schizo_base_setup_child(orte_job_t *jdata,
8185
int rc;
8286
orte_schizo_base_active_module_t *mod;
8387

88+
/* if no personality was specified, then nothing to do */
89+
if (NULL == jdata->personality) {
90+
return ORTE_SUCCESS;
91+
}
92+
8493
OPAL_LIST_FOREACH(mod, &orte_schizo_base.active_modules, orte_schizo_base_active_module_t) {
8594
if (0 == strcmp(jdata->personality, mod->component->mca_component_name)) {
8695
if (NULL != mod->module->setup_child) {

orte/mca/state/base/state_base_fns.c

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
3-
* Copyright (c) 2014 Intel, Inc. All rights reserved.
3+
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
44
* $COPYRIGHT$
55
*
66
* Additional copyrights may follow
@@ -521,13 +521,13 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
521521
/* update the proc state */
522522
ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE);
523523
pdata->state = state;
524-
if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_LOCAL)) {
524+
if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_LOCAL)) {
525525
/* Clean up the session directory as if we were the process
526526
* itself. This covers the case where the process died abnormally
527527
* and didn't cleanup its own session directory.
528528
*/
529529
orte_session_dir_finalize(proc);
530-
}
530+
}
531531
/* if we are trying to terminate and our routes are
532532
* gone, then terminate ourselves IF no local procs
533533
* remain (might be some from another job)
@@ -550,11 +550,11 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
550550
}
551551
/* return the allocated slot for reuse */
552552
cleanup_node(pdata);
553-
/* track job status */
554-
jdata->num_terminated++;
555-
if (jdata->num_terminated == jdata->num_procs) {
553+
/* track job status */
554+
jdata->num_terminated++;
555+
if (jdata->num_terminated == jdata->num_procs) {
556556
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
557-
}
557+
}
558558
}
559559

560560
cleanup:
@@ -752,10 +752,10 @@ void orte_state_base_check_all_complete(int fd, short args, void *cbdata)
752752
* is maintained!
753753
*/
754754
if (1 < j) {
755-
if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
756-
/* this was a debugger daemon. notify that a debugger has detached */
757-
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DEBUGGER_DETACH);
758-
}
755+
if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
756+
/* this was a debugger daemon. notify that a debugger has detached */
757+
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DEBUGGER_DETACH);
758+
}
759759
opal_pointer_array_set_item(orte_job_data, j, NULL); /* ensure the array has a NULL */
760760
OBJ_RELEASE(jdata);
761761
}

orte/runtime/orte_mca_params.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
1414
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
1515
* All rights reserved
16-
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
16+
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
1717
* Copyright (c) 2014 Research Organization for Information Science
1818
* and Technology (RIST). All rights reserved.
1919
* $COPYRIGHT$
@@ -276,7 +276,7 @@ int orte_register_params(void)
276276
"Test debugger colaunch after debugger attachment",
277277
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
278278
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
279-
&orte_debugger_test_daemon);
279+
&orte_debugger_test_attach);
280280

281281
orte_debugger_check_rate = 0;
282282
(void) mca_base_var_register ("orte", "orte", NULL, "debugger_check_rate",

orte/test/mpi/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help crisscross read_write ziatest slave reduce-hang ziaprobe ziatest bcast_loop parallel_w8 parallel_w64 parallel_r8 parallel_r64 sio sendrecv_blaster early_abort debugger singleton_client_server intercomm_create spawn_tree init-exit77 mpi_info info_spawn server client paccept pconnect ring hello.sapp binding badcoll
1+
PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help crisscross read_write ziatest slave reduce-hang ziaprobe ziatest bcast_loop parallel_w8 parallel_w64 parallel_r8 parallel_r64 sio sendrecv_blaster early_abort debugger singleton_client_server intercomm_create spawn_tree init-exit77 mpi_info info_spawn server client paccept pconnect ring hello.sapp binding badcoll attach
22

33
all: $(PROGS)
44

orte/test/mpi/attach.c

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
/* -*- C -*-
2+
*
3+
* $HEADER$
4+
*
5+
* Test helper that simulates a debugger attach by writing a single byte to the debugger attach fifo named on the command line
6+
*/
7+
8+
#include <stdio.h>
9+
#include <stdlib.h>
10+
#include <sys/types.h>
11+
#include <sys/stat.h>
12+
#include <fcntl.h>
13+
#include <unistd.h>
14+
15+
/*
 * Simulate a debugger attach: open the fifo whose path is given as the
 * first command-line argument and write one command byte (value 1) to it.
 *
 * Returns 0 once the byte has been written; prints a diagnostic to stderr
 * and exits with status 1 if the argument is missing or the fifo cannot
 * be opened or written.
 */
int main(int argc, char* argv[])
{
    unsigned char fifo_cmd = 1;
    ssize_t nwritten;
    int fd;

    /* argv[0] is the program name, so the fifo path requires argc >= 2 */
    if (2 > argc) {
        fprintf(stderr, "usage: attach <full-path-to-debugger-fifo-file>\n");
        exit(1);
    }

    fd = open(argv[1], O_WRONLY);
    if (0 > fd) {
        /* don't fall through and write to an invalid descriptor */
        perror("attach: open");
        exit(1);
    }

    nwritten = write(fd, &fifo_cmd, sizeof(fifo_cmd));
    if ((ssize_t)sizeof(fifo_cmd) != nwritten) {
        /* the attach byte was not delivered - report it instead of
         * silently claiming success */
        perror("attach: write");
        close(fd);
        exit(1);
    }

    close(fd);

    return 0;
}

0 commit comments

Comments
 (0)