Skip to content

Commit 1a2a4dd

Browse files
authored
Merge pull request #5217 from jsquyres/pr/v2.x/fix-finalized-hang
v2.x: Fix MPI_FINALIZED_HANG
2 parents a563cae + 30533ba commit 1a2a4dd

21 files changed

+194
-135
lines changed

ompi/errhandler/errhandler.h

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
* University of Stuttgart. All rights reserved.
1111
* Copyright (c) 2004-2005 The Regents of the University of California.
1212
* All rights reserved.
13-
* Copyright (c) 2008-2012 Cisco Systems, Inc. All rights reserved.
13+
* Copyright (c) 2008-2018 Cisco Systems, Inc. All rights reserved
1414
* Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved.
1515
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
1616
* Copyright (c) 2016 Los Alamos National Security, LLC. All rights
@@ -192,11 +192,22 @@ struct ompi_request_t;
192192
* This macro directly invokes the ompi_mpi_errors_are_fatal_handler()
193193
* when an error occurs because MPI_COMM_WORLD does not exist (because
194194
* we're before MPI_Init() or after MPI_Finalize()).
195+
*
196+
* NOTE: The ompi_mpi_state variable is a volatile that is set
197+
* atomically in ompi_mpi_init() and ompi_mpi_finalize(). The
198+
* appropriate memory barriers are done in those 2 functions such that
199+
* we do not need to do a read memory barrier here (in
200+
* potentially-performance-critical code paths) before reading the
201+
* variable.
195202
*/
196-
#define OMPI_ERR_INIT_FINALIZE(name) \
197-
if( OPAL_UNLIKELY(!ompi_mpi_initialized || ompi_mpi_finalized) ) { \
198-
ompi_mpi_errors_are_fatal_comm_handler(NULL, NULL, name); \
199-
}
203+
#define OMPI_ERR_INIT_FINALIZE(name) \
204+
{ \
205+
int32_t state = ompi_mpi_state; \
206+
if (OPAL_UNLIKELY(state < OMPI_MPI_STATE_INIT_COMPLETED || \
207+
state > OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT)) { \
208+
ompi_mpi_errors_are_fatal_comm_handler(NULL, NULL, name); \
209+
} \
210+
}
200211

201212
/**
202213
* This is the macro to invoke to directly invoke an MPI error

ompi/errhandler/errhandler_predefined.c

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
1212
* Copyright (c) 2006 University of Houston. All rights reserved.
13-
* Copyright (c) 2008-2013 Cisco Systems, Inc. All rights reserved.
13+
* Copyright (c) 2008-2018 Cisco Systems, Inc. All rights reserved
1414
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
1515
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
1616
* Copyright (c) 2012 Los Alamos National Security, LLC.
@@ -149,7 +149,7 @@ void ompi_mpi_errors_return_win_handler(struct ompi_win_t **win,
149149

150150
static void out(char *str, char *arg)
151151
{
152-
if (ompi_mpi_initialized && !ompi_mpi_finalized) {
152+
if (ompi_mpi_state < OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
153153
if (NULL != arg) {
154154
opal_output(0, str, arg);
155155
} else {
@@ -190,7 +190,9 @@ static void backend_fatal_aggregate(char *type,
190190
const char* usable_prefix = unknown_prefix;
191191
const char* usable_err_msg = unknown_error;
192192

193-
assert(ompi_mpi_initialized && !ompi_mpi_finalized);
193+
int32_t state = ompi_mpi_state;
194+
assert(state < OMPI_MPI_STATE_INIT_COMPLETED ||
195+
state >= OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT);
194196

195197
arg = va_arg(arglist, char*);
196198
va_end(arglist);
@@ -282,7 +284,9 @@ static void backend_fatal_no_aggregate(char *type,
282284
{
283285
char *arg;
284286

285-
assert(!ompi_mpi_initialized || ompi_mpi_finalized);
287+
int32_t state = ompi_mpi_state;
288+
assert(state < OMPI_MPI_STATE_INIT_COMPLETED ||
289+
state >= OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT);
286290

287291
fflush(stdout);
288292
fflush(stderr);
@@ -291,7 +295,7 @@ static void backend_fatal_no_aggregate(char *type,
291295

292296
/* Per #2152, print out in plain english if something was invoked
293297
before MPI_INIT* or after MPI_FINALIZE */
294-
if (!ompi_mpi_init_started && !ompi_mpi_initialized) {
298+
if (state < OMPI_MPI_STATE_INIT_STARTED) {
295299
if (NULL != arg) {
296300
out("*** The %s() function was called before MPI_INIT was invoked.\n"
297301
"*** This is disallowed by the MPI standard.\n", arg);
@@ -302,7 +306,7 @@ static void backend_fatal_no_aggregate(char *type,
302306
"*** function was invoked, sorry. :-(\n", NULL);
303307
}
304308
out("*** Your MPI job will now abort.\n", NULL);
305-
} else if (ompi_mpi_finalized) {
309+
} else if (state >= OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
306310
if (NULL != arg) {
307311
out("*** The %s() function was called after MPI_FINALIZE was invoked.\n"
308312
"*** This is disallowed by the MPI standard.\n", arg);
@@ -377,7 +381,9 @@ static void backend_fatal(char *type, struct ompi_communicator_t *comm,
377381
{
378382
/* We only want aggregation after MPI_INIT and before
379383
MPI_FINALIZE. */
380-
if (ompi_mpi_initialized && !ompi_mpi_finalized) {
384+
int32_t state = ompi_mpi_state;
385+
if (state >= OMPI_MPI_STATE_INIT_COMPLETED &&
386+
state < OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
381387
backend_fatal_aggregate(type, comm, name, error_code, arglist);
382388
} else {
383389
backend_fatal_no_aggregate(type, comm, name, error_code, arglist);

ompi/mca/coll/fca/coll_fca_ops.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
* Copyright (c) 2011 Mellanox Technologies. All rights reserved.
33
* Copyright (c) 2015 Research Organization for Information Science
44
* and Technology (RIST). All rights reserved.
5+
* Copyright (c) 2018 Cisco Systems, Inc. All rights reserved
56
* $COPYRIGHT$
67
*
78
* Additional copyrights may follow
@@ -159,7 +160,7 @@ int mca_coll_fca_barrier(struct ompi_communicator_t *comm,
159160
int ret;
160161

161162
FCA_VERBOSE(5,"Using FCA Barrier");
162-
if (OPAL_UNLIKELY(ompi_mpi_finalize_started)) {
163+
if (OPAL_UNLIKELY(ompi_mpi_state >= OMPI_MPI_STATE_FINALIZE_STARTED)) {
163164
FCA_VERBOSE(5, "In finalize, reverting to previous barrier");
164165
goto orig_barrier;
165166
}

ompi/mca/coll/hcoll/coll_hcoll_module.c

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,15 @@
11
/**
2-
Copyright (c) 2011 Mellanox Technologies. All rights reserved.
3-
Copyright (c) 2016 IBM Corporation. All rights reserved.
4-
$COPYRIGHT$
5-
6-
Additional copyrights may follow
7-
8-
$HEADER$
2+
* Copyright (c) 2011 Mellanox Technologies. All rights reserved.
3+
* Copyright (c) 2016 IBM Corporation. All rights reserved.
4+
* Copyright (c) 2017 The University of Tennessee and The University
5+
* of Tennessee Research Foundation. All rights
6+
* reserved.
7+
* Copyright (c) 2018 Cisco Systems, Inc. All rights reserved
8+
* $COPYRIGHT$
9+
*
10+
* Additional copyrights may follow
11+
*
12+
* $HEADER$
913
*/
1014

1115
#include "ompi_config.h"
@@ -238,7 +242,7 @@ static int mca_coll_hcoll_module_enable(mca_coll_base_module_t *module,
238242

239243
int mca_coll_hcoll_progress(void)
240244
{
241-
if (ompi_mpi_finalized){
245+
if (ompi_mpi_state >= OMPI_MPI_STATE_FINALIZE_STARTED) {
242246
hcoll_rte_p2p_disabled_notify();
243247
}
244248

ompi/mca/coll/hcoll/coll_hcoll_ops.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
Copyright (c) 2011 Mellanox Technologies. All rights reserved.
33
Copyright (c) 2015 Research Organization for Information Science
44
and Technology (RIST). All rights reserved.
5+
Copyright (c) 2018 Cisco Systems, Inc. All rights reserved
56
$COPYRIGHT$
67
78
Additional copyrights may follow
@@ -21,7 +22,7 @@ int mca_coll_hcoll_barrier(struct ompi_communicator_t *comm,
2122
mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module;
2223
HCOL_VERBOSE(20,"RUNNING HCOL BARRIER");
2324

24-
if (OPAL_UNLIKELY(ompi_mpi_finalize_started)) {
25+
if (OPAL_UNLIKELY(ompi_mpi_state >= OMPI_MPI_STATE_FINALIZE_STARTED)) {
2526
HCOL_VERBOSE(5, "In finalize, reverting to previous barrier");
2627
goto orig_barrier;
2728
}

ompi/mca/io/romio314/src/io_romio314_file_open.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
* All rights reserved.
1212
* Copyright (c) 2015 Research Organization for Information Science
1313
* and Technology (RIST). All rights reserved.
14+
* Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
15+
* Copyright (c) 2018 Cisco Systems, Inc. All rights reserved
1416
* $COPYRIGHT$
1517
*
1618
* Additional copyrights may follow
@@ -58,7 +60,7 @@ mca_io_romio314_file_close (ompi_file_t *fh)
5860
which we obviously can't do if we've started to MPI_Finalize).
5961
The user didn't close the file, so they should expect
6062
unexpected behavior. */
61-
if (ompi_mpi_finalized) {
63+
if (ompi_mpi_state >= OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
6264
return OMPI_SUCCESS;
6365
}
6466

ompi/mca/pml/yalla/pml_yalla.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
* and Technology (RIST). All rights reserved.
55
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
66
* reserved.
7+
* Copyright (c) 2018 Cisco Systems, Inc. All rights reserved
78
* $COPYRIGHT$
89
*
910
* Additional copyrights may follow
@@ -268,8 +269,8 @@ int mca_pml_yalla_del_procs(struct ompi_proc_t **procs, size_t nprocs)
268269
{
269270
size_t i;
270271

271-
if (ompi_mpi_finalized) {
272-
PML_YALLA_VERBOSE(3, "using bulk powerdown");
272+
if (ompi_mpi_state >= OMPI_MPI_STATE_FINALIZE_STARTED) {
273+
PML_YALLA_VERBOSE(3, "%s", "using bulk powerdown");
273274
mxm_ep_powerdown(ompi_pml_yalla.mxm_ep);
274275
}
275276

ompi/mpi/c/finalized.c

Lines changed: 10 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@
1111
* All rights reserved.
1212
* Copyright (c) 2015 Research Organization for Information Science
1313
* and Technology (RIST). All rights reserved.
14-
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
14+
* Copyright (c) 2015-2018 Cisco Systems, Inc. All rights reserved
15+
* Copyright (c) 2015 Intel, Inc. All rights reserved
1516
* $COPYRIGHT$
1617
*
1718
* Additional copyrights may follow
@@ -40,13 +41,7 @@ int MPI_Finalized(int *flag)
4041
{
4142
MPI_Comm null = NULL;
4243

43-
/* We must obtain the lock to guarnatee consistent values of
44-
ompi_mpi_initialized and ompi_mpi_finalized. Note, too, that
45-
this lock is held for the bulk of the duration of
46-
ompi_mpi_init() and ompi_mpi_finalize(), so when we get the
47-
lock, we are guaranteed that some other thread is not part way
48-
through initialization or finalization. */
49-
opal_mutex_lock(&ompi_mpi_bootstrap_mutex);
44+
int32_t state = ompi_mpi_state;
5045

5146
if (MPI_PARAM_CHECK) {
5247
if (NULL == flag) {
@@ -55,20 +50,21 @@ int MPI_Finalized(int *flag)
5550
whether we're currently (after MPI_Init and before
5651
MPI_Finalize) or not */
5752

58-
if (ompi_mpi_initialized && !ompi_mpi_finalized) {
59-
opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
53+
if (state >= OMPI_MPI_STATE_INIT_COMPLETED &&
54+
state < OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
6055
return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_ARG,
6156
FUNC_NAME);
6257
} else {
63-
opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
64-
return OMPI_ERRHANDLER_INVOKE(null, MPI_ERR_ARG,
58+
/* We have no MPI object here so call ompi_errhandle_invoke
59+
* directly */
60+
return ompi_errhandler_invoke(NULL, NULL, -1,
61+
ompi_errcode_get_mpi_code(MPI_ERR_ARG),
6562
FUNC_NAME);
6663
}
6764
}
6865
}
6966

70-
*flag = ompi_mpi_finalized;
71-
opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
67+
*flag = (state >= OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT);
7268

7369
return MPI_SUCCESS;
7470
}

ompi/mpi/c/get_library_version.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
* University of Stuttgart. All rights reserved.
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
12-
* Copyright (c) 2014-2015 Cisco Systems, Inc. All rights reserved.
12+
* Copyright (c) 2014-2018 Cisco Systems, Inc. All rights reserved
1313
* Copyright (c) 2015 Research Organization for Information Science
1414
* and Technology (RIST). All rights reserved.
1515
* $COPYRIGHT$
@@ -56,7 +56,9 @@ int MPI_Get_library_version(char *version, int *resultlen)
5656
(i.e., use a NULL communicator, which will end up at the
5757
default errhandler, which is abort). */
5858

59-
if (ompi_mpi_initialized && !ompi_mpi_finalized) {
59+
int32_t state = ompi_mpi_state;
60+
if (state >= OMPI_MPI_STATE_INIT_COMPLETED &&
61+
state < OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
6062
return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_ARG,
6163
FUNC_NAME);
6264
} else {

ompi/mpi/c/get_version.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
* All rights reserved.
1212
* Copyright (c) 2015 Research Organization for Information Science
1313
* and Technology (RIST). All rights reserved.
14+
* Copyright (c) 2015 Intel, Inc. All rights reserved
15+
* Copyright (c) 2018 Cisco Systems, Inc. All rights reserved
1416
* $COPYRIGHT$
1517
*
1618
* Additional copyrights may follow
@@ -53,7 +55,9 @@ int MPI_Get_version(int *version, int *subversion)
5355
(i.e., use a NULL communicator, which will end up at the
5456
default errhandler, which is abort). */
5557

56-
if (ompi_mpi_initialized && !ompi_mpi_finalized) {
58+
int32_t state = ompi_mpi_state;
59+
if (state >= OMPI_MPI_STATE_INIT_COMPLETED &&
60+
state < OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
5761
return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_ARG,
5862
FUNC_NAME);
5963
} else {

ompi/mpi/c/init.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
* University of Stuttgart. All rights reserved.
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
12-
* Copyright (c) 2007-2015 Cisco Systems, Inc. All rights reserved.
12+
* Copyright (c) 2007-2018 Cisco Systems, Inc. All rights reserved
1313
* Copyright (c) 2007-2008 Sun Microsystems, Inc. All rights reserved.
1414
* Copyright (c) 2015 Research Organization for Information Science
1515
* and Technology (RIST). All rights reserved.
@@ -63,9 +63,9 @@ int MPI_Init(int *argc, char ***argv)
6363
don't lose anything) */
6464

6565
if (NULL != argc && NULL != argv) {
66-
err = ompi_mpi_init(*argc, *argv, required, &provided);
66+
err = ompi_mpi_init(*argc, *argv, required, &provided, false);
6767
} else {
68-
err = ompi_mpi_init(0, NULL, required, &provided);
68+
err = ompi_mpi_init(0, NULL, required, &provided, false);
6969
}
7070

7171
/* Since we don't have a communicator to invoke an errorhandler on

ompi/mpi/c/init_thread.c

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,9 @@
1212
* Copyright (c) 2010 Oak Ridge National Labs. All rights reserved.
1313
* Copyright (c) 2015 Research Organization for Information Science
1414
* and Technology (RIST). All rights reserved.
15-
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
15+
* Copyright (c) 2015-2018 Cisco Systems, Inc. All rights reserved
16+
* Copyright (c) 2016 Los Alamos National Security, LLC. All rights
17+
* reserved.
1618
* $COPYRIGHT$
1719
*
1820
* Additional copyrights may follow
@@ -66,9 +68,9 @@ int MPI_Init_thread(int *argc, char ***argv, int required,
6668
don't lose anything) */
6769

6870
if (NULL != argc && NULL != argv) {
69-
err = ompi_mpi_init(*argc, *argv, required, provided);
71+
err = ompi_mpi_init(*argc, *argv, required, provided, false);
7072
} else {
71-
err = ompi_mpi_init(0, NULL, required, provided);
73+
err = ompi_mpi_init(0, NULL, required, provided, false);
7274
}
7375

7476
/* Since we don't have a communicator to invoke an errorhandler on

ompi/mpi/c/initialized.c

Lines changed: 10 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@
1111
* All rights reserved.
1212
* Copyright (c) 2015 Research Organization for Information Science
1313
* and Technology (RIST). All rights reserved.
14-
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
14+
* Copyright (c) 2015-2018 Cisco Systems, Inc. All rights reserved
15+
* Copyright (c) 2015 Intel, Inc. All rights reserved
1516
* $COPYRIGHT$
1617
*
1718
* Additional copyrights may follow
@@ -40,13 +41,7 @@ int MPI_Initialized(int *flag)
4041
{
4142
MPI_Comm null = NULL;
4243

43-
/* We must obtain the lock to guarnatee consistent values of
44-
ompi_mpi_initialized and ompi_mpi_finalized. Note, too, that
45-
this lock is held for the bulk of the duration of
46-
ompi_mpi_init() and ompi_mpi_finalize(), so when we get the
47-
lock, we are guaranteed that some other thread is not part way
48-
through initialization or finalization. */
49-
opal_mutex_lock(&ompi_mpi_bootstrap_mutex);
44+
int32_t state = ompi_mpi_state;
5045

5146
if (MPI_PARAM_CHECK) {
5247
if (NULL == flag) {
@@ -55,20 +50,21 @@ int MPI_Initialized(int *flag)
5550
whether we're currently (after MPI_Init and before
5651
MPI_Finalize) or not */
5752

58-
if (ompi_mpi_initialized && !ompi_mpi_finalized) {
59-
opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
53+
if (state >= OMPI_MPI_STATE_INIT_COMPLETED &&
54+
state < OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
6055
return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_ARG,
6156
FUNC_NAME);
6257
} else {
63-
opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
64-
return OMPI_ERRHANDLER_INVOKE(null, MPI_ERR_ARG,
58+
/* We have no MPI object here so call ompi_errhandle_invoke
59+
* directly */
60+
return ompi_errhandler_invoke(NULL, NULL, -1,
61+
ompi_errcode_get_mpi_code(MPI_ERR_ARG),
6562
FUNC_NAME);
6663
}
6764
}
6865
}
6966

70-
*flag = ompi_mpi_initialized;
71-
opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
67+
*flag = (state >= OMPI_MPI_STATE_INIT_COMPLETED);
7268

7369
return MPI_SUCCESS;
7470
}

0 commit comments

Comments
 (0)