@@ -970,10 +970,7 @@ void mca_oob_tcp_component_lost_connection(int fd, short args, void *cbdata)
970
970
ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
971
971
ORTE_NAME_PRINT (& pop -> peer ));
972
972
973
- /* if we are terminating, or recovery isn't enabled, then don't attempt to reconnect */
974
- if (!orte_enable_recovery || orte_orteds_term_ordered || orte_finalizing || orte_abnormal_term_ordered ) {
975
- goto cleanup ;
976
- }
973
+ MCA_OOB_TCP_CHECK_SHUTDOWN (pop );
977
974
978
975
/* Mark that we no longer support this peer */
979
976
memcpy (& ui64 , (char * )& pop -> peer , sizeof (uint64_t ));
@@ -987,7 +984,6 @@ void mca_oob_tcp_component_lost_connection(int fd, short args, void *cbdata)
987
984
ORTE_ERROR_LOG (rc );
988
985
}
989
986
990
- cleanup :
991
987
/* activate the proc state */
992
988
if (ORTE_SUCCESS != orte_routed .route_lost (& pop -> peer )) {
993
989
ORTE_ACTIVATE_PROC_STATE (& pop -> peer , ORTE_PROC_STATE_LIFELINE_LOST );
@@ -1010,6 +1006,8 @@ void mca_oob_tcp_component_no_route(int fd, short args, void *cbdata)
1010
1006
ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
1011
1007
ORTE_NAME_PRINT (& mop -> hop ));
1012
1008
1009
+ MCA_OOB_TCP_CHECK_SHUTDOWN (mop );
1010
+
1013
1011
/* mark that we cannot reach this hop */
1014
1012
memcpy (& ui64 , (char * )& (mop -> hop ), sizeof (uint64_t ));
1015
1013
if (OPAL_SUCCESS != opal_hash_table_get_value_uint64 (& orte_oob_base .peers ,
@@ -1022,16 +1020,11 @@ void mca_oob_tcp_component_no_route(int fd, short args, void *cbdata)
1022
1020
ORTE_ERROR_LOG (rc );
1023
1021
}
1024
1022
1025
- /* report the error back to the OOB and let it try other components
1026
- * or declare a problem
1027
- */
1028
- if (!orte_finalizing && !orte_abnormal_term_ordered ) {
1029
- /* if this was a lifeline, then alert */
1030
- if (ORTE_SUCCESS != orte_routed .route_lost (& mop -> hop )) {
1031
- ORTE_ACTIVATE_PROC_STATE (& mop -> hop , ORTE_PROC_STATE_LIFELINE_LOST );
1032
- } else {
1033
- ORTE_ACTIVATE_PROC_STATE (& mop -> hop , ORTE_PROC_STATE_COMM_FAILED );
1034
- }
1023
+ /* if this was a lifeline, then alert */
1024
+ if (ORTE_SUCCESS != orte_routed .route_lost (& mop -> hop )) {
1025
+ ORTE_ACTIVATE_PROC_STATE (& mop -> hop , ORTE_PROC_STATE_LIFELINE_LOST );
1026
+ } else {
1027
+ ORTE_ACTIVATE_PROC_STATE (& mop -> hop , ORTE_PROC_STATE_COMM_FAILED );
1035
1028
}
1036
1029
1037
1030
OBJ_RELEASE (mop );
@@ -1049,11 +1042,7 @@ void mca_oob_tcp_component_hop_unknown(int fd, short args, void *cbdata)
1049
1042
ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
1050
1043
ORTE_NAME_PRINT (& mop -> hop ));
1051
1044
1052
- if (orte_finalizing || orte_abnormal_term_ordered ) {
1053
- /* just ignore the problem */
1054
- OBJ_RELEASE (mop );
1055
- return ;
1056
- }
1045
+ MCA_OOB_TCP_CHECK_SHUTDOWN (mop );
1057
1046
1058
1047
/* mark that this component cannot reach this hop */
1059
1048
memcpy (& ui64 , (char * )& (mop -> hop ), sizeof (uint64_t ));
@@ -1121,11 +1110,7 @@ void mca_oob_tcp_component_failed_to_connect(int fd, short args, void *cbdata)
1121
1110
ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
1122
1111
ORTE_NAME_PRINT (& pop -> peer ));
1123
1112
1124
- /* if we are terminating, then don't attempt to reconnect */
1125
- if (orte_orteds_term_ordered || orte_finalizing || orte_abnormal_term_ordered ) {
1126
- OBJ_RELEASE (pop );
1127
- return ;
1128
- }
1113
+ MCA_OOB_TCP_CHECK_SHUTDOWN (pop );
1129
1114
1130
1115
/* activate the proc state */
1131
1116
opal_output_verbose (OOB_TCP_DEBUG_CONNECT , orte_oob_base_framework .framework_output ,
0 commit comments