Skip to content

Commit ce4ad2d

Browse files
authored
Merge pull request #5220 from jsquyres/pr/v3.1.x/fix-finalized-hang
v3.1.x: mpi/finalized: revamp INITIALIZED/FINALIZED
2 parents 7064237 + ac3a441 commit ce4ad2d

21 files changed

+167
-132
lines changed

ompi/errhandler/errhandler.h

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
* University of Stuttgart. All rights reserved.
1111
* Copyright (c) 2004-2005 The Regents of the University of California.
1212
* All rights reserved.
13-
* Copyright (c) 2008-2012 Cisco Systems, Inc. All rights reserved.
13+
* Copyright (c) 2008-2018 Cisco Systems, Inc. All rights reserved
1414
* Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved.
1515
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
1616
* Copyright (c) 2016 Los Alamos National Security, LLC. All rights
@@ -193,11 +193,22 @@ struct ompi_request_t;
193193
* This macro directly invokes the ompi_mpi_errors_are_fatal_handler()
194194
* when an error occurs because MPI_COMM_WORLD does not exist (because
195195
* we're before MPI_Init() or after MPI_Finalize()).
196+
*
197+
* NOTE: The ompi_mpi_state variable is a volatile that is set
198+
* atomically in ompi_mpi_init() and ompi_mpi_finalize(). The
199+
* appropriate memory barriers are done in those 2 functions such that
200+
* we do not need to do a read memory barrier here (in
201+
* potentially-performance-critical code paths) before reading the
202+
* variable.
196203
*/
197-
#define OMPI_ERR_INIT_FINALIZE(name) \
198-
if( OPAL_UNLIKELY(!ompi_mpi_initialized || ompi_mpi_finalized) ) { \
199-
ompi_mpi_errors_are_fatal_comm_handler(NULL, NULL, name); \
200-
}
204+
#define OMPI_ERR_INIT_FINALIZE(name) \
205+
{ \
206+
int32_t state = ompi_mpi_state; \
207+
if (OPAL_UNLIKELY(state < OMPI_MPI_STATE_INIT_COMPLETED || \
208+
state > OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT)) { \
209+
ompi_mpi_errors_are_fatal_comm_handler(NULL, NULL, name); \
210+
} \
211+
}
201212

202213
/**
203214
* This is the macro to invoke to directly invoke an MPI error

ompi/errhandler/errhandler_predefined.c

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
1212
* Copyright (c) 2006 University of Houston. All rights reserved.
13-
* Copyright (c) 2008-2013 Cisco Systems, Inc. All rights reserved.
13+
* Copyright (c) 2008-2018 Cisco Systems, Inc. All rights reserved
1414
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
1515
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
1616
* Copyright (c) 2012 Los Alamos National Security, LLC.
@@ -149,7 +149,8 @@ void ompi_mpi_errors_return_win_handler(struct ompi_win_t **win,
149149

150150
static void out(char *str, char *arg)
151151
{
152-
if (ompi_rte_initialized && !ompi_mpi_finalized) {
152+
if (ompi_rte_initialized &&
153+
ompi_mpi_state < OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
153154
if (NULL != arg) {
154155
opal_output(0, str, arg);
155156
} else {
@@ -280,7 +281,9 @@ static void backend_fatal_no_aggregate(char *type,
280281
{
281282
char *arg;
282283

283-
assert(!ompi_mpi_initialized || ompi_mpi_finalized);
284+
int32_t state = ompi_mpi_state;
285+
assert(state < OMPI_MPI_STATE_INIT_COMPLETED ||
286+
state >= OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT);
284287

285288
fflush(stdout);
286289
fflush(stderr);
@@ -289,7 +292,7 @@ static void backend_fatal_no_aggregate(char *type,
289292

290293
/* Per #2152, print out in plain english if something was invoked
291294
before MPI_INIT* or after MPI_FINALIZE */
292-
if (!ompi_mpi_init_started && !ompi_mpi_initialized) {
295+
if (state < OMPI_MPI_STATE_INIT_STARTED) {
293296
if (NULL != arg) {
294297
out("*** The %s() function was called before MPI_INIT was invoked.\n"
295298
"*** This is disallowed by the MPI standard.\n", arg);
@@ -300,7 +303,7 @@ static void backend_fatal_no_aggregate(char *type,
300303
"*** function was invoked, sorry. :-(\n", NULL);
301304
}
302305
out("*** Your MPI job will now abort.\n", NULL);
303-
} else if (ompi_mpi_finalized) {
306+
} else if (state >= OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
304307
if (NULL != arg) {
305308
out("*** The %s() function was called after MPI_FINALIZE was invoked.\n"
306309
"*** This is disallowed by the MPI standard.\n", arg);

ompi/mca/coll/fca/coll_fca_ops.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
* Copyright (c) 2011 Mellanox Technologies. All rights reserved.
33
* Copyright (c) 2015 Research Organization for Information Science
44
* and Technology (RIST). All rights reserved.
5+
* Copyright (c) 2018 Cisco Systems, Inc. All rights reserved
56
* $COPYRIGHT$
67
*
78
* Additional copyrights may follow
@@ -159,7 +160,7 @@ int mca_coll_fca_barrier(struct ompi_communicator_t *comm,
159160
int ret;
160161

161162
FCA_VERBOSE(5,"Using FCA Barrier");
162-
if (OPAL_UNLIKELY(ompi_mpi_finalize_started)) {
163+
if (OPAL_UNLIKELY(ompi_mpi_state >= OMPI_MPI_STATE_FINALIZE_STARTED)) {
163164
FCA_VERBOSE(5, "In finalize, reverting to previous barrier");
164165
goto orig_barrier;
165166
}

ompi/mca/coll/hcoll/coll_hcoll_module.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
* Copyright (c) 2017 The University of Tennessee and The University
55
* of Tennessee Research Foundation. All rights
66
* reserved.
7+
* Copyright (c) 2018 Cisco Systems, Inc. All rights reserved
78
* $COPYRIGHT$
89
*
910
* Additional copyrights may follow
@@ -241,7 +242,7 @@ static int mca_coll_hcoll_module_enable(mca_coll_base_module_t *module,
241242

242243
int mca_coll_hcoll_progress(void)
243244
{
244-
if (ompi_mpi_finalized){
245+
if (ompi_mpi_state >= OMPI_MPI_STATE_FINALIZE_STARTED) {
245246
hcoll_rte_p2p_disabled_notify();
246247
}
247248

ompi/mca/coll/hcoll/coll_hcoll_ops.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
Copyright (c) 2011 Mellanox Technologies. All rights reserved.
33
Copyright (c) 2015 Research Organization for Information Science
44
and Technology (RIST). All rights reserved.
5+
Copyright (c) 2018 Cisco Systems, Inc. All rights reserved
56
$COPYRIGHT$
67
78
Additional copyrights may follow
@@ -21,7 +22,7 @@ int mca_coll_hcoll_barrier(struct ompi_communicator_t *comm,
2122
mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module;
2223
HCOL_VERBOSE(20,"RUNNING HCOL BARRIER");
2324

24-
if (OPAL_UNLIKELY(ompi_mpi_finalize_started)) {
25+
if (OPAL_UNLIKELY(ompi_mpi_state >= OMPI_MPI_STATE_FINALIZE_STARTED)) {
2526
HCOL_VERBOSE(5, "In finalize, reverting to previous barrier");
2627
goto orig_barrier;
2728
}

ompi/mca/io/romio314/src/io_romio314_file_open.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
* Copyright (c) 2015 Research Organization for Information Science
1313
* and Technology (RIST). All rights reserved.
1414
* Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
15+
* Copyright (c) 2018 Cisco Systems, Inc. All rights reserved
1516
* $COPYRIGHT$
1617
*
1718
* Additional copyrights may follow
@@ -69,7 +70,7 @@ mca_io_romio314_file_close (ompi_file_t *fh)
6970
which we obviously can't do if we've started to MPI_Finalize).
7071
The user didn't close the file, so they should expect
7172
unexpected behavior. */
72-
if (ompi_mpi_finalized) {
73+
if (ompi_mpi_state >= OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
7374
return OMPI_SUCCESS;
7475
}
7576

ompi/mca/pml/yalla/pml_yalla.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
* Copyright (C) 2001-2011 Mellanox Technologies Ltd. ALL RIGHTS RESERVED.
33
* Copyright (c) 2015 Research Organization for Information Science
44
* and Technology (RIST). All rights reserved.
5+
* Copyright (c) 2018 Cisco Systems, Inc. All rights reserved
56
* $COPYRIGHT$
67
*
78
* Additional copyrights may follow
@@ -265,7 +266,7 @@ int mca_pml_yalla_del_procs(struct ompi_proc_t **procs, size_t nprocs)
265266
{
266267
size_t i;
267268

268-
if (ompi_mpi_finalized) {
269+
if (ompi_mpi_state >= OMPI_MPI_STATE_FINALIZE_STARTED) {
269270
PML_YALLA_VERBOSE(3, "%s", "using bulk powerdown");
270271
mxm_ep_powerdown(ompi_pml_yalla.mxm_ep);
271272
}

ompi/mpi/c/finalized.c

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
* All rights reserved.
1212
* Copyright (c) 2015 Research Organization for Information Science
1313
* and Technology (RIST). All rights reserved.
14-
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
14+
* Copyright (c) 2015-2018 Cisco Systems, Inc. All rights reserved
1515
* Copyright (c) 2015 Intel, Inc. All rights reserved
1616
* $COPYRIGHT$
1717
*
@@ -44,13 +44,7 @@ int MPI_Finalized(int *flag)
4444

4545
ompi_hook_base_mpi_finalized_top(flag);
4646

47-
/* We must obtain the lock to guarnatee consistent values of
48-
ompi_mpi_initialized and ompi_mpi_finalized. Note, too, that
49-
this lock is held for the bulk of the duration of
50-
ompi_mpi_init() and ompi_mpi_finalize(), so when we get the
51-
lock, we are guaranteed that some other thread is not part way
52-
through initialization or finalization. */
53-
opal_mutex_lock(&ompi_mpi_bootstrap_mutex);
47+
int32_t state = ompi_mpi_state;
5448

5549
if (MPI_PARAM_CHECK) {
5650
if (NULL == flag) {
@@ -59,12 +53,11 @@ int MPI_Finalized(int *flag)
5953
whether we're currently (after MPI_Init and before
6054
MPI_Finalize) or not */
6155

62-
if (ompi_mpi_initialized && !ompi_mpi_finalized) {
63-
opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
56+
if (state >= OMPI_MPI_STATE_INIT_COMPLETED &&
57+
state < OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
6458
return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_ARG,
6559
FUNC_NAME);
6660
} else {
67-
opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
6861
/* We have no MPI object here so call ompi_errhandle_invoke
6962
* directly */
7063
return ompi_errhandler_invoke(NULL, NULL, -1,
@@ -74,8 +67,7 @@ int MPI_Finalized(int *flag)
7467
}
7568
}
7669

77-
*flag = ompi_mpi_finalized;
78-
opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
70+
*flag = (state >= OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT);
7971

8072
ompi_hook_base_mpi_finalized_bottom(flag);
8173

ompi/mpi/c/get_library_version.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
* University of Stuttgart. All rights reserved.
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
12-
* Copyright (c) 2014-2015 Cisco Systems, Inc. All rights reserved.
12+
* Copyright (c) 2014-2018 Cisco Systems, Inc. All rights reserved
1313
* Copyright (c) 2015 Research Organization for Information Science
1414
* and Technology (RIST). All rights reserved.
1515
* Copyright (c) 2015 Intel, Inc. All rights reserved
@@ -58,7 +58,9 @@ int MPI_Get_library_version(char *version, int *resultlen)
5858
(i.e., use a NULL communicator, which will end up at the
5959
default errhandler, which is abort). */
6060

61-
if (ompi_mpi_initialized && !ompi_mpi_finalized) {
61+
int32_t state = ompi_mpi_state;
62+
if (state >= OMPI_MPI_STATE_INIT_COMPLETED &&
63+
state < OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
6264
return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_ARG,
6365
FUNC_NAME);
6466
} else {

ompi/mpi/c/get_version.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
* Copyright (c) 2015 Research Organization for Information Science
1313
* and Technology (RIST). All rights reserved.
1414
* Copyright (c) 2015 Intel, Inc. All rights reserved
15+
* Copyright (c) 2018 Cisco Systems, Inc. All rights reserved
1516
* $COPYRIGHT$
1617
*
1718
* Additional copyrights may follow
@@ -54,7 +55,9 @@ int MPI_Get_version(int *version, int *subversion)
5455
(i.e., use a NULL communicator, which will end up at the
5556
default errhandler, which is abort). */
5657

57-
if (ompi_mpi_initialized && !ompi_mpi_finalized) {
58+
int32_t state = ompi_mpi_state;
59+
if (state >= OMPI_MPI_STATE_INIT_COMPLETED &&
60+
state < OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
5861
return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_ARG,
5962
FUNC_NAME);
6063
} else {

ompi/mpi/c/init.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
* University of Stuttgart. All rights reserved.
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
12-
* Copyright (c) 2007-2015 Cisco Systems, Inc. All rights reserved.
12+
* Copyright (c) 2007-2018 Cisco Systems, Inc. All rights reserved
1313
* Copyright (c) 2007-2008 Sun Microsystems, Inc. All rights reserved.
1414
* Copyright (c) 2015 Research Organization for Information Science
1515
* and Technology (RIST). All rights reserved.
@@ -63,9 +63,9 @@ int MPI_Init(int *argc, char ***argv)
6363
don't lose anything) */
6464

6565
if (NULL != argc && NULL != argv) {
66-
err = ompi_mpi_init(*argc, *argv, required, &provided);
66+
err = ompi_mpi_init(*argc, *argv, required, &provided, false);
6767
} else {
68-
err = ompi_mpi_init(0, NULL, required, &provided);
68+
err = ompi_mpi_init(0, NULL, required, &provided, false);
6969
}
7070

7171
/* Since we don't have a communicator to invoke an errorhandler on

ompi/mpi/c/init_thread.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
* Copyright (c) 2010 Oak Ridge National Labs. All rights reserved.
1414
* Copyright (c) 2015 Research Organization for Information Science
1515
* and Technology (RIST). All rights reserved.
16-
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
16+
* Copyright (c) 2015-2018 Cisco Systems, Inc. All rights reserved
1717
* Copyright (c) 2016 Los Alamos National Security, LLC. All rights
1818
* reserved.
1919
* $COPYRIGHT$
@@ -63,9 +63,9 @@ int MPI_Init_thread(int *argc, char ***argv, int required,
6363
don't lose anything) */
6464

6565
if (NULL != argc && NULL != argv) {
66-
err = ompi_mpi_init(*argc, *argv, required, provided);
66+
err = ompi_mpi_init(*argc, *argv, required, provided, false);
6767
} else {
68-
err = ompi_mpi_init(0, NULL, required, provided);
68+
err = ompi_mpi_init(0, NULL, required, provided, false);
6969
}
7070

7171
/* Since we don't have a communicator to invoke an errorhandler on

ompi/mpi/c/initialized.c

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
* All rights reserved.
1212
* Copyright (c) 2015 Research Organization for Information Science
1313
* and Technology (RIST). All rights reserved.
14-
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
14+
* Copyright (c) 2015-2018 Cisco Systems, Inc. All rights reserved
1515
* Copyright (c) 2015 Intel, Inc. All rights reserved
1616
* $COPYRIGHT$
1717
*
@@ -44,13 +44,7 @@ int MPI_Initialized(int *flag)
4444

4545
ompi_hook_base_mpi_initialized_top(flag);
4646

47-
/* We must obtain the lock to guarnatee consistent values of
48-
ompi_mpi_initialized and ompi_mpi_finalized. Note, too, that
49-
this lock is held for the bulk of the duration of
50-
ompi_mpi_init() and ompi_mpi_finalize(), so when we get the
51-
lock, we are guaranteed that some other thread is not part way
52-
through initialization or finalization. */
53-
opal_mutex_lock(&ompi_mpi_bootstrap_mutex);
47+
int32_t state = ompi_mpi_state;
5448

5549
if (MPI_PARAM_CHECK) {
5650
if (NULL == flag) {
@@ -59,12 +53,11 @@ int MPI_Initialized(int *flag)
5953
whether we're currently (after MPI_Init and before
6054
MPI_Finalize) or not */
6155

62-
if (ompi_mpi_initialized && !ompi_mpi_finalized) {
63-
opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
56+
if (state >= OMPI_MPI_STATE_INIT_COMPLETED &&
57+
state < OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
6458
return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_ARG,
6559
FUNC_NAME);
6660
} else {
67-
opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
6861
/* We have no MPI object here so call ompi_errhandle_invoke
6962
* directly */
7063
return ompi_errhandler_invoke(NULL, NULL, -1,
@@ -74,8 +67,7 @@ int MPI_Initialized(int *flag)
7467
}
7568
}
7669

77-
*flag = ompi_mpi_initialized;
78-
opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
70+
*flag = (state >= OMPI_MPI_STATE_INIT_COMPLETED);
7971

8072
ompi_hook_base_mpi_initialized_bottom(flag);
8173

ompi/mpi/tool/finalize.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
/*
33
* Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
44
* reserved.
5-
* Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
5+
* Copyright (c) 2014-2018 Cisco Systems, Inc. All rights reserved
66
* Copyright (c) 2017 IBM Corporation. All rights reserved.
77
* $COPYRIGHT$
88
*
@@ -39,7 +39,9 @@ int MPI_T_finalize (void)
3939
if (0 == --ompi_mpit_init_count) {
4040
(void) ompi_info_close_components ();
4141

42-
if ((!ompi_mpi_initialized || ompi_mpi_finalized) &&
42+
int32_t state = ompi_mpi_state;
43+
if ((state < OMPI_MPI_STATE_INIT_COMPLETED ||
44+
state >= OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) &&
4345
(NULL != ompi_mpi_main_thread)) {
4446
/* we are not between MPI_Init and MPI_Finalize so we
4547
* have to free the ompi_mpi_main_thread */

ompi/peruse/peruse.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
* reserved.
55
* Copyright (c) 2004-2008 High Performance Computing Center Stuttgart,
66
* University of Stuttgart. All rights reserved.
7+
* Copyright (c) 2018 Cisco Systems, Inc. All rights reserved
78
* $COPYRIGHT$
89
*
910
* Additional copyrights may follow
@@ -64,8 +65,11 @@ const int PERUSE_num_events = (sizeof(PERUSE_events) / sizeof(peruse_event_assoc
6465
int PERUSE_Init (void)
6566
{
6667
if (MPI_PARAM_CHECK) {
67-
if (!ompi_mpi_initialized || ompi_mpi_finalized)
68+
int32_t state = ompi_mpi_state;
69+
if (state < OMPI_MPI_STATE_INIT_COMPLETED ||
70+
state >= OMPI_MPI_STATE_FINALIZE_STARTED) {
6871
return PERUSE_ERR_INIT;
72+
}
6973
}
7074
ompi_peruse_init ();
7175
return PERUSE_SUCCESS;

0 commit comments

Comments
 (0)