1
1
#! /bin/bash
2
2
set -e
3
3
4
- SCRIPT_NAME=$( basename " $0 " )
5
- SCRIPT_PID=$$
6
- PODS_NS_LIST=(openshift-ovn-kubernetes openshift-service-ca openshift-ingress openshift-dns)
7
- PODS_CT_LIST=(2 1 1 2)
8
- LOG_POD_EVENTS=false
9
-
10
4
# Source the MicroShift health check functions library
11
5
# shellcheck source=packaging/greenboot/functions.sh
12
6
source /usr/share/microshift/functions/greenboot.sh
13
7
14
- # Set the term handler to convert exit code to 1
15
- trap ' forced_termination' TERM SIGINT
16
-
17
- # Set the exit handler to log the exit status
18
- trap ' log_script_exit' EXIT
19
-
20
- # Handler that will be called when the script is terminated by sending TERM or
21
- # INT signals. To override default exit codes it forces returning 1 like the
22
- # rest of the error conditions throughout the health check.
23
- function forced_termination() {
24
- echo " Signal received, terminating."
25
- exit 1
26
- }
27
-
28
- # Check preconditions for existence of lvms deployment.
29
- # Adapted from MicroShift code.
30
- #
31
- # args: None
32
- # return: 0 if lvms readiness should be checked, 1 otherwise
33
- function lvmsShouldBeDeployed() {
34
- if ! hash vgs 2> /dev/null; then
35
- return 1
36
- fi
37
- if [ -f /etc/microshift/lvmd.yaml ]; then
38
- return 0
39
- fi
40
- if ! lvmsDriverShouldExist; then
41
- return 1
42
- fi
43
-
44
- local -r volume_groups=$( vgs --readonly --options=name --noheadings)
45
- local -r volume_groups_count=$( echo " ${volume_groups} " | wc -w)
46
- if [ " ${volume_groups_count} " -eq 0 ]; then
47
- return 1
48
- elif [ " ${volume_groups_count} " -eq 1 ]; then
49
- return 0
50
- elif echo " ${volume_groups} " | grep -qw " microshift" ; then
51
- return 0
52
- else
53
- return 1
54
- fi
55
- }
56
-
57
- # Check if MicroShift API 'readyz' and 'livez' health endpoints are OK
58
- #
59
- # args: None
60
- # return: 0 if all API health endpoints are OK, or 1 otherwise
61
- function microshift_health_endpoints_ok() {
62
- local -r check_rd=$( ${OCGET_CMD} --raw=' /readyz?verbose' | awk ' $2 != "ok"' )
63
- local -r check_lv=$( ${OCGET_CMD} --raw=' /livez?verbose' | awk ' $2 != "ok"' )
64
-
65
- [ " ${check_rd} " != " readyz check passed" ] && return 1
66
- [ " ${check_lv} " != " livez check passed" ] && return 1
67
- return 0
68
- }
69
-
70
- # Check if any MicroShift pods are in the 'Running' status
71
- #
72
- # args: None
73
- # return: 0 if any pods are in the 'Running' status, or 1 otherwise
74
- function any_pods_running() {
75
- local -r count=$( ${OCGET_CMD} pods ${OCGET_OPT} -A 2> /dev/null | awk ' $4~/Running/' | wc -l)
76
-
77
- [ " ${count} " -gt 0 ] && return 0
78
- return 1
79
- }
80
-
81
- #
82
- # Main
83
- #
84
-
85
8
# Exit if the current user is not 'root'
86
9
if [ " $( id -u) " -ne 0 ] ; then
87
10
echo " The '${SCRIPT_NAME} ' script must be run with the 'root' user privileges"
@@ -93,102 +16,7 @@ echo "STARTED"
93
16
# Print the boot variable status
94
17
print_boot_status
95
18
96
- # Exit if the MicroShift service is not enabled
97
- # TODO: Remove when `microshift healthcheck` is complete.
98
- if [ " $( systemctl is-enabled microshift.service 2> /dev/null) " != " enabled" ] ; then
99
- echo " MicroShift service is not enabled. Exiting..."
100
- exit 0
101
- fi
102
-
103
19
# Set the wait timeout for the current check based on the boot counter
104
20
WAIT_TIMEOUT_SECS=$( get_wait_timeout)
105
21
106
22
/usr/bin/microshift healthcheck -v=2 --timeout=" ${WAIT_TIMEOUT_SECS} s"
107
-
108
- # Wait for MicroShift API health endpoints to be OK
109
- echo " Waiting ${WAIT_TIMEOUT_SECS} s for MicroShift API health endpoints to be OK"
110
- if ! wait_for " ${WAIT_TIMEOUT_SECS} " microshift_health_endpoints_ok ; then
111
- log_failure_cmd " health-readyz" " ${OCGET_CMD} --raw=/readyz?verbose"
112
- log_failure_cmd " health-livez" " ${OCGET_CMD} --raw=/livez?verbose"
113
-
114
- echo " Error: Timed out waiting for MicroShift API health endpoints to be OK"
115
- exit 1
116
- fi
117
-
118
- if lvmsShouldBeDeployed; then
119
- PODS_NS_LIST+=(openshift-storage)
120
- PODS_CT_LIST+=(2)
121
- fi
122
- declare -a csi_components=(' csi-snapshot-controller' ' csi-snapshot-webhook' )
123
- csi_pods_ct=0
124
- for csi_c in " ${csi_components[@]} " ; do
125
- if csiComponentShouldBeDeployed " ${csi_c} " ; then
126
- (( csi_pods_ct += 1 ))
127
- fi
128
- done
129
- if [ ${csi_pods_ct} -gt 0 ]; then
130
- PODS_NS_LIST+=(kube-system)
131
- PODS_CT_LIST+=(" ${csi_pods_ct} " )
132
- fi
133
-
134
- # Starting pod-specific checks
135
- # Log list of pods and their events on failure
136
- LOG_POD_EVENTS=true
137
-
138
- # Wait for any pods to enter running state
139
- echo " Waiting ${WAIT_TIMEOUT_SECS} s for any pods to be running"
140
- if ! wait_for " ${WAIT_TIMEOUT_SECS} " any_pods_running ; then
141
- echo " Error: Timed out waiting for any MicroShift pod to be running"
142
- exit 1
143
- fi
144
-
145
- # Wait for MicroShift core pod images to be downloaded
146
- for i in " ${! PODS_NS_LIST[@]} " ; do
147
- CHECK_PODS_NS=${PODS_NS_LIST[${i}]}
148
-
149
- echo " Waiting ${WAIT_TIMEOUT_SECS} s for pod image(s) from the '${CHECK_PODS_NS} ' namespace to be downloaded"
150
- if ! wait_for " ${WAIT_TIMEOUT_SECS} " namespace_images_downloaded; then
151
- echo " Error: Timed out waiting for pod image(s) from the '${CHECK_PODS_NS} ' namespace to be downloaded"
152
- exit 1
153
- fi
154
- done
155
-
156
- # Wait for MicroShift core pods to enter ready state
157
- for i in " ${! PODS_NS_LIST[@]} " ; do
158
- CHECK_PODS_NS=${PODS_NS_LIST[${i}]}
159
- CHECK_PODS_CT=${PODS_CT_LIST[${i}]}
160
-
161
- echo " Waiting ${WAIT_TIMEOUT_SECS} s for ${CHECK_PODS_CT} pod(s) from the '${CHECK_PODS_NS} ' namespace to be in 'Ready' state"
162
- if ! wait_for " ${WAIT_TIMEOUT_SECS} " namespace_pods_ready; then
163
- echo " Error: Timed out waiting for ${CHECK_PODS_CT} pod(s) in the '${CHECK_PODS_NS} ' namespace to be in 'Ready' state"
164
- exit 1
165
- fi
166
- done
167
-
168
- # Verify that MicroShift core pods are not restarting
169
- declare -A pid2name
170
- for i in " ${! PODS_NS_LIST[@]} " ; do
171
- CHECK_PODS_NS=${PODS_NS_LIST[${i}]}
172
-
173
- echo " Checking pod restart count in the '${CHECK_PODS_NS} ' namespace"
174
- namespace_pods_not_restarting " ${CHECK_PODS_NS} " &
175
- pid=$!
176
-
177
- pid2name[" ${pid} " ]=" ${CHECK_PODS_NS} "
178
- done
179
-
180
- # Wait for the restart check functions to complete, printing errors in case of a failure
181
- check_failed=false
182
- for pid in " ${! pid2name[@]} " ; do
183
- if ! wait " ${pid} " ; then
184
- check_failed=true
185
-
186
- name=${pid2name["${pid}"]}
187
- echo " Error: Pods are restarting too frequently in the '${name} ' namespace"
188
- fi
189
- done
190
-
191
- # Exit with an error code if the pod restart check failed
192
- if ${check_failed} ; then
193
- exit 1
194
- fi
0 commit comments