Skip to content

Commit dc857a9

Browse files
author
rhc54
committed
Merge pull request #1267 from rhc54/topic/oob
Standardize the handling of shutdown in the OOB TCP component
2 parents ddf0f27 + 12dccaa commit dc857a9

File tree

2 files changed

+22
-25
lines changed

2 files changed

+22
-25
lines changed

orte/mca/oob/tcp/oob_tcp_component.c

Lines changed: 10 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -970,10 +970,7 @@ void mca_oob_tcp_component_lost_connection(int fd, short args, void *cbdata)
970970
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
971971
ORTE_NAME_PRINT(&pop->peer));
972972

973-
/* if we are terminating, or recovery isn't enabled, then don't attempt to reconnect */
974-
if (!orte_enable_recovery || orte_orteds_term_ordered || orte_finalizing || orte_abnormal_term_ordered) {
975-
goto cleanup;
976-
}
973+
MCA_OOB_TCP_CHECK_SHUTDOWN(pop);
977974

978975
/* Mark that we no longer support this peer */
979976
memcpy(&ui64, (char*)&pop->peer, sizeof(uint64_t));
@@ -987,7 +984,6 @@ void mca_oob_tcp_component_lost_connection(int fd, short args, void *cbdata)
987984
ORTE_ERROR_LOG(rc);
988985
}
989986

990-
cleanup:
991987
/* activate the proc state */
992988
if (ORTE_SUCCESS != orte_routed.route_lost(&pop->peer)) {
993989
ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_LIFELINE_LOST);
@@ -1010,6 +1006,8 @@ void mca_oob_tcp_component_no_route(int fd, short args, void *cbdata)
10101006
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
10111007
ORTE_NAME_PRINT(&mop->hop));
10121008

1009+
MCA_OOB_TCP_CHECK_SHUTDOWN(mop);
1010+
10131011
/* mark that we cannot reach this hop */
10141012
memcpy(&ui64, (char*)&(mop->hop), sizeof(uint64_t));
10151013
if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&orte_oob_base.peers,
@@ -1022,16 +1020,11 @@ void mca_oob_tcp_component_no_route(int fd, short args, void *cbdata)
10221020
ORTE_ERROR_LOG(rc);
10231021
}
10241022

1025-
/* report the error back to the OOB and let it try other components
1026-
* or declare a problem
1027-
*/
1028-
if (!orte_finalizing && !orte_abnormal_term_ordered) {
1029-
/* if this was a lifeline, then alert */
1030-
if (ORTE_SUCCESS != orte_routed.route_lost(&mop->hop)) {
1031-
ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_LIFELINE_LOST);
1032-
} else {
1033-
ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_COMM_FAILED);
1034-
}
1023+
/* if this was a lifeline, then alert */
1024+
if (ORTE_SUCCESS != orte_routed.route_lost(&mop->hop)) {
1025+
ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_LIFELINE_LOST);
1026+
} else {
1027+
ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_COMM_FAILED);
10351028
}
10361029

10371030
OBJ_RELEASE(mop);
@@ -1049,11 +1042,7 @@ void mca_oob_tcp_component_hop_unknown(int fd, short args, void *cbdata)
10491042
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
10501043
ORTE_NAME_PRINT(&mop->hop));
10511044

1052-
if (orte_finalizing || orte_abnormal_term_ordered) {
1053-
/* just ignore the problem */
1054-
OBJ_RELEASE(mop);
1055-
return;
1056-
}
1045+
MCA_OOB_TCP_CHECK_SHUTDOWN(mop);
10571046

10581047
/* mark that this component cannot reach this hop */
10591048
memcpy(&ui64, (char*)&(mop->hop), sizeof(uint64_t));
@@ -1121,11 +1110,7 @@ void mca_oob_tcp_component_failed_to_connect(int fd, short args, void *cbdata)
11211110
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
11221111
ORTE_NAME_PRINT(&pop->peer));
11231112

1124-
/* if we are terminating, then don't attempt to reconnect */
1125-
if (orte_orteds_term_ordered || orte_finalizing || orte_abnormal_term_ordered) {
1126-
OBJ_RELEASE(pop);
1127-
return;
1128-
}
1113+
MCA_OOB_TCP_CHECK_SHUTDOWN(pop);
11291114

11301115
/* activate the proc state */
11311116
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,

orte/mca/oob/tcp/oob_tcp_component.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,4 +92,16 @@ ORTE_MODULE_DECLSPEC void mca_oob_tcp_component_failed_to_connect(int fd, short
9292
ORTE_MODULE_DECLSPEC void mca_oob_tcp_component_no_route(int fd, short args, void *cbdata);
9393
ORTE_MODULE_DECLSPEC void mca_oob_tcp_component_hop_unknown(int fd, short args, void *cbdata);
9494

95+
/* Provide a macro for handling errors reported during shutdown.
 *
 * The macro tests four flags: orte_enable_recovery, orte_orteds_term_ordered,
 * orte_finalizing and orte_abnormal_term_ordered.  If recovery is NOT
 * enabled, or if any of the three termination flags is set, it releases
 * the object passed as (a) via OBJ_RELEASE and returns from the
 * *calling* function.
 *
 * Usage notes:
 *  - The expansion contains a bare "return;", so the macro may only be
 *    used inside functions returning void.
 *  - (a) is released on the early-return path; the caller must not touch
 *    it afterwards and must not release it again.
 *  - Because "!orte_enable_recovery" is part of the condition, the early
 *    return is taken whenever recovery is disabled, not only while a
 *    shutdown is in progress.
 *  - The definition deliberately does NOT end in a semicolon: the caller
 *    supplies it, so that "if (x) MCA_OOB_TCP_CHECK_SHUTDOWN(p); else ..."
 *    parses as a single statement.
 */
#define MCA_OOB_TCP_CHECK_SHUTDOWN(a)           \
    do {                                        \
        if (!orte_enable_recovery ||            \
            orte_orteds_term_ordered ||         \
            orte_finalizing ||                  \
            orte_abnormal_term_ordered) {       \
            OBJ_RELEASE(a);                     \
            return;                             \
        }                                       \
    } while(0)
106+
95107
#endif /* _MCA_OOB_TCP_COMPONENT_H_ */

0 commit comments

Comments
 (0)