Skip to content

Commit 887a161

Browse files
committed
How to recover from partitioning after 'pause_if_all_down' is configurable
Now that 'pause_if_all_down' accepts a list of preferred nodes, it is possible that these nodes are spread across multiple partitions. For example, suppose we have nodes A and B in datacenter #1 and nodes C and D in datacenter #2, and we set {pause_if_all_down, [A, C]}. If the link between both datacenters is lost, A/B and C/D form two partitions. RabbitMQ continues to run at both sites because all nodes see at least one node from the preferred nodes list. When the link comes back, we need to handle the recovery. Therefore, a user can specify the strategy: o {pause_if_all_down, [...], ignore} (default) o {pause_if_all_down, [...], autoheal} This third parameter is mandatory. If the strategy is 'ignore', RabbitMQ is started again on paused nodes as soon as they see another node from the preferred nodes list. This is the default behaviour. If the strategy is 'autoheal', RabbitMQ is started again, as in 'ignore' mode, but when all nodes are up, autohealing kicks in as well. Compared to plain 'autoheal' mode, the chance of losing data is low because paused nodes never drifted away from the cluster. When they start again, they join the cluster and resume operations like any starting node.
1 parent 6513caf commit 887a161

File tree

2 files changed

+15
-6
lines changed

2 files changed

+15
-6
lines changed

src/rabbit_autoheal.erl

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,8 @@
1616

1717
-module(rabbit_autoheal).
1818

19-
-export([init/0, maybe_start/1, rabbit_down/2, node_down/2, handle_msg/3]).
19+
-export([init/0, enabled/0, maybe_start/1, rabbit_down/2, node_down/2,
20+
handle_msg/3]).
2021

2122
%% The named process we are running in.
2223
-define(SERVER, rabbit_node_monitor).
@@ -80,7 +81,11 @@ maybe_start(State) ->
8081
State.
8182

8283
enabled() ->
83-
{ok, autoheal} =:= application:get_env(rabbit, cluster_partition_handling).
84+
case application:get_env(rabbit, cluster_partition_handling) of
85+
{ok, autoheal} -> true;
86+
{ok, {pause_if_all_down, _, autoheal}} -> true;
87+
_ -> false
88+
end.
8489

8590

8691
%% This is the winner receiving its last notification that a node has

src/rabbit_node_monitor.erl

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,7 @@ pause_partition_guard() ->
221221
case M of
222222
pause_minority ->
223223
pause_minority_guard([]);
224-
{pause_if_all_down, PreferredNodes} ->
224+
{pause_if_all_down, PreferredNodes, _} ->
225225
case verify_pause_if_all_down_list(PreferredNodes) of
226226
[] -> put(pause_partition_guard, not_pause_mode),
227227
ok;
@@ -562,7 +562,7 @@ handle_dead_node(Node, State = #state{autoheal = Autoheal}) ->
562562
false -> await_cluster_recovery(fun majority/0)
563563
end,
564564
State;
565-
{ok, {pause_if_all_down, PreferredNodes}} ->
565+
{ok, {pause_if_all_down, PreferredNodes, HowToRecover}} ->
566566
case verify_pause_if_all_down_list(PreferredNodes) of
567567
[] -> ok;
568568
Nodes -> case in_preferred_partition(Nodes) of
@@ -571,7 +571,11 @@ handle_dead_node(Node, State = #state{autoheal = Autoheal}) ->
571571
fun in_preferred_partition/0)
572572
end
573573
end,
574-
State;
574+
case HowToRecover of
575+
autoheal -> State#state{autoheal =
576+
rabbit_autoheal:node_down(Node, Autoheal)};
577+
_ -> State
578+
end;
575579
{ok, ignore} ->
576580
State;
577581
{ok, autoheal} ->
@@ -747,7 +751,7 @@ majority() ->
747751
length(alive_nodes(Nodes)) / length(Nodes) > 0.5.
748752

749753
in_preferred_partition() ->
750-
{ok, {pause_if_all_down, PreferredNodes}} =
754+
{ok, {pause_if_all_down, PreferredNodes, _}} =
751755
application:get_env(rabbit, cluster_partition_handling),
752756
in_preferred_partition(PreferredNodes).
753757

0 commit comments

Comments
 (0)