Skip to content

Commit 4a5a89c

Browse files
authored
fix: doris scale in (#2233)
1 parent 5a9c967 commit 4a5a89c

File tree

3 files changed

+43
-19
lines changed

3 files changed

+43
-19
lines changed

addons/doris/scripts/fe/fe_member_leave.sh

Lines changed: 31 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,22 @@
33
set +x
44
set -o errexit
55

6+
7+
function info() {
8+
echo "[$(date +'%Y-%m-%d %H:%M:%S')] $*"
9+
}
10+
11+
info "Start to leave FE member"
12+
613
leader_host=""
714
leave_member_host=""
815
leave_member_port=""
916
leave_role=""
10-
helper_endpoints=""
17+
# always use the FQDN of pod 0 (the first entry in POD_FQDN_LIST) as helper_endpoints
18+
helper_endpoints=$(echo "$POD_FQDN_LIST" | cut -d, -f1)
19+
helper_pod_name=$(echo "$helper_endpoints" | cut -d: -f1 | cut -d. -f1)
1120
candidate_names=""
1221

13-
function info() {
14-
echo "[$(date +'%Y-%m-%d %H:%M:%S')] $*"
15-
}
1622

1723
# root@x-fe-0:/opt/starrocks# mysql -h 127.0.0.1 -P 9030 -e "show frontends"
1824
## +-----------------------------------------+------------------------------------------------------+-------------+----------+-----------+---------+--------------------+----------+----------+-----------+------+-------+-------------------+---------------------+---------------------+----------+--------+-----------------------------+------------------+
@@ -24,10 +30,23 @@ function info() {
2430
# +-----------------------------------------+------------------------------------------------------+-------------+----------+-----------+---------+--------------------+----------+----------+-----------+------+-------+-------------------+---------------------+---------------------+----------+--------+-----------------------------+------------------+
2531

2632
function show_frontends() {
27-
mysql -N -B -h "${FE_DISCOVERY_ADDR}" -P 9030 -u"${DORIS_USER}" -p"${DORIS_PASSWORD}" -e "show frontends"
33+
local retry_count=0
34+
local max_retries=20
35+
local retry_interval=6
36+
while (( retry_count < max_retries )); do
37+
if mysql -N -B -h "${FE_DISCOVERY_ADDR}" -P 9030 -u"${DORIS_USER}" -p"${DORIS_PASSWORD}" -e "show frontends"; then
38+
return 0
39+
fi
40+
retry_count=$((retry_count + 1))
41+
info "Failed to execute 'show frontends', retrying in ${retry_interval} seconds... (${retry_count}/${max_retries})" >&2
42+
sleep ${retry_interval}
43+
done
44+
info "Failed to execute 'show frontends' after ${max_retries} retries." >&2
45+
exit 1
2846
}
2947

3048
function switch_leader() {
49+
info "switch leader from ${leader_host} to ${candidate_names}, address:${helper_endpoints}"
3150
java -jar /opt/apache-doris/fe/lib/je-18.3.14-doris-SNAPSHOT.jar DbGroupAdmin -helperHosts "${helper_endpoints}" -groupName PALO_JOURNAL_GROUP -transferMaster -force "${candidate_names}" 5000
3251
}
3352

@@ -50,24 +69,18 @@ while IFS= read -r line; do
5069
edit_log_port=$(echo "$line" | awk '{print $3}')
5170
role=$(echo "$line" | awk '{print $8}')
5271
is_master=$(echo "$line" | awk '{print $9}')
53-
is_leaving=False
5472
if [[ ${ip} == ${KB_LEAVE_MEMBER_POD_NAME}* ]]; then
55-
is_leaving=True
5673
leave_member_host=${ip}
5774
leave_member_port=${edit_log_port}
5875
leave_role=${role}
5976
fi
6077
if [ "${is_master}" == "true" ]; then
6178
leader_host=${ip}
6279
fi
63-
if [ "${is_leaving}" == "False" ] && [ "${role}" == "FOLLOWER" ]; then
64-
if [ -n "${helper_endpoints}" ]; then
65-
helper_endpoints=${helper_endpoints},${ip}:${edit_log_port}
66-
candidate_names=${candidate_names},${name}
67-
else
68-
helper_endpoints=${ip}:${edit_log_port}
69-
candidate_names=${name}
70-
fi
80+
81+
if [[ ${ip} == "${helper_endpoints}" ]]; then
82+
candidate_names=${name}
83+
helper_endpoints=${ip}:${edit_log_port}
7184
fi
7285
done <<< "$output"
7386

@@ -82,10 +95,11 @@ if [ -z "${leave_member_host}" ] || [ -z "${leave_member_port}" ]; then
8295
exit 0
8396
fi
8497

85-
# The leader will exit if it loses its leader role
86-
if [[ ${leader_host} == ${KB_LEAVE_MEMBER_POD_NAME}* ]]; then
98+
if [[ ${KB_AGENT_POD_NAME} != ${helper_pod_name} ]]; then
8799
switch_leader
88100
wait_for_leader_switched
89101
fi
90102

91103
mysql -h "${leader_host}" -u"${DORIS_USER}" -p"${DORIS_PASSWORD}" -P 9030 -e "alter system drop ${leave_role} '${leave_member_host}:${leave_member_port}';"
104+
105+
info "leave member ${leave_member_host}:${leave_member_port} success"

addons/doris/scripts/fe/init_fe.sh

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ readonly RETRY_INTERVAL=3
2626
readonly FE_CONFIG_FILE="${DORIS_HOME}/fe/conf/fe.conf"
2727
readonly FOLLOWER_NUMBER=3
2828
readonly BACKUP_DIR="${DORIS_HOME}/fe/doris-meta/ape/backup"
29+
2930
export DATE="$(date +%Y%m%d-%H%M%S)"
3031

3132
cp /etc/config/fe.conf ${FE_CONFIG_FILE}
@@ -110,7 +111,14 @@ setup_election_mode() {
110111
done
111112

112113
if [ "$found" = "false" ]; then
113-
log_error "Could not find configuration for pod '${pod_name}' in POD_FQDN_LIST"
114+
log_info "Could not find configuration for pod '${pod_name}' in POD_FQDN_LIST"
115+
log_info "The pod may be removed by scale-in Ops"
116+
local retry_count=0
117+
while [ "$retry_count" -lt "$MAX_RETRY_TIMES" ]; do
118+
sleep ${RETRY_INTERVAL}
119+
retry_count=$((retry_count + 1))
120+
done
121+
log_error "Pod should be removed by scale-in Ops after ${retry_count} retries"
114122
fi
115123

116124
is_master_fe=$([[ "$pod_name" == "${pod_name_array[0]}" ]] && echo "true" || echo "false")

addons/doris/templates/cmpd-fe.yaml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,7 @@ spec:
231231
roleProbe:
232232
initialDelaySeconds: 15
233233
periodSeconds: 5
234+
timeoutSeconds: 3
234235
exec:
235236
container: fe
236237
command:
@@ -241,12 +242,13 @@ spec:
241242
retryPolicy:
242243
maxRetries: 3
243244
retryInterval: 5
245+
timeoutSeconds: 300
244246
exec:
245247
command:
246248
- /bin/bash
247249
- -c
248250
- |
249-
/opt/apache-doris/scripts/fe_member_leave.sh > /tmp/member-leave.log 2>&1
251+
/opt/apache-doris/scripts/fe_member_leave.sh >> /opt/apache-doris/fe/log/member-leave.log 2>&1
250252
targetPodSelector: Role
251253
container: fe
252254
matchingKey: master

0 commit comments

Comments
 (0)