Merged
28 commits
9ad650c
feat(merge_table): Add schema handling for merge tables
ieQu1 Apr 14, 2026
943401f
feat(merge_table): Bump protocol version
ieQu1 Apr 14, 2026
87a4d06
refactor(replica): Register replica process via gproc
ieQu1 Apr 14, 2026
5e29c6d
refactor(merge_table): Rename core_shard_sup to shard_upstream_sup
ieQu1 Apr 14, 2026
678caf0
refactor: Rename replicant_shard_sup to shard_downstream_sup
ieQu1 Apr 14, 2026
50235d0
feat(merged_table): Implement bootstrap/cleanup for merged tables
ieQu1 Apr 14, 2026
bb9558f
fix(merge_table): Add local_content property to merge tables
ieQu1 Apr 14, 2026
638b511
test: Start test_shard explicitly
ieQu1 Apr 14, 2026
2dfed3d
feat(schema): Cache information about merge shards
ieQu1 Apr 14, 2026
3a64e60
feat(trans): Implement mria_rlog:shard_writes function
ieQu1 Apr 14, 2026
5b93586
test: Add wait_tables/2 function
ieQu1 Apr 15, 2026
9b3c243
test(merge_table): Verify schema persistent term
ieQu1 Apr 15, 2026
0530517
feat(merge_table): Add upstream as parameter to replica worker
ieQu1 Apr 15, 2026
4b7b046
feat(merge_table): Draft of implementation
ieQu1 Apr 15, 2026
d0ec2e0
fix(bootstrapper): Import merge_table batches directly to ETS
ieQu1 Apr 15, 2026
dcf303d
feat(merge_table): Import via ETS
ieQu1 Apr 16, 2026
bc77027
test(merge_table): Fix test
ieQu1 Apr 16, 2026
b5869c9
fix(merge_table): Minor fixes
ieQu1 Apr 16, 2026
c763062
feat(merge_table): Verify merge table via single match spec
ieQu1 Apr 16, 2026
e6341ad
fix(merge_table): Simplify node enforcement
ieQu1 Apr 16, 2026
bc58223
fix(merge_table): Add transaction interceptor on replicant
ieQu1 Apr 16, 2026
c3d7dc7
test(merge_table): Test dirty operations
ieQu1 Apr 16, 2026
721e62c
test(merge_table): Test bootstrapping
ieQu1 Apr 16, 2026
6cfc37b
fix(membership): Do not reply to pings unless local member present
ieQu1 Apr 16, 2026
b311a1a
feat(merge_table): Autoclean
ieQu1 Apr 16, 2026
8f16439
fix: Fix return value of start_shard
ieQu1 Apr 16, 2026
04a971a
test(fault-tolerance): Fix getting pid of replica worker
ieQu1 Apr 16, 2026
239a926
feat(merge_table): Support node pattern with multiple clauses
ieQu1 Apr 17, 2026
34 changes: 34 additions & 0 deletions README.md
@@ -57,6 +57,40 @@ Transactions for each shard are replicated independently.
Currently transaction can only modify tables in one shard.
Usually it is a good idea to group all tables that belong to a particular OTP application in one shard.

## Merge tables

Tables where every record includes the ID of the node that created it are called "merge tables".

How to create such tables:

```erlang
mria:create_table(my_merged, [ {type, ordered_set}
, {rlog_shard, shard_id}
, {node_pattern, #my_record{key = {'_', '$1'}, _ = '_'}}
, {merge_table, true}
, {auto_clean, boolean()} %% Optional, default = false
])
```

1. All tables in the shard must have the `merge_table` property set to `true`.
2. The `node_pattern` property is mandatory.
Its value must be an ETS match pattern with exactly one free variable, `'$1'`, marking the position of the owner node's ID in the record.
Mria verifies that this position is set to `node()` for each record.
3. `auto_clean` is an optional property that allows a downstream node to clean all records owned by an upstream node when the latter disconnects.
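For illustration, a record layout compatible with the `node_pattern` used in the example above might look like the following sketch (this exact record definition is an assumption for illustration, not part of the library):

```erlang
%% Hypothetical record matching node_pattern = #my_record{key = {'_', '$1'}, _ = '_'}:
%% the second element of the key holds the owner node's ID.
-record(my_record,
        { key   :: {term(), node()} %% {ActualKey, OwnerNode}
        , value :: term()
        }).
```

With this layout, `'$1'` binds to the second element of the key, so Mria can check that it equals `node()` on the node performing the write.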

Unlike regular tables, both cores and replicants use local writes to update merge tables.
In a clustered setup, the contents of a merge table consist of records from all reachable peer nodes.
Records from remote nodes are read-only.
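Putting it together, a local write to a merge table could look like the sketch below; `my_record` is the hypothetical record layout implied by the `node_pattern` above, and the call shapes follow the regular-table API:

```erlang
%% Write a record owned by the local node. The node ID embedded in the
%% key must be node(); otherwise Mria rejects the write:
{atomic, ok} = mria:transaction(shard_id,
                                fun() ->
                                    mnesia:write(my_merged,
                                                 #my_record{key = {my_key, node()},
                                                            value = 42},
                                                 write)
                                end).
```

The same write works identically on core and replicant nodes, since merge tables are updated locally on both.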

Comment on lines +75 to +93

Copilot AI Apr 16, 2026


The README states that Mria verifies node_pattern is set to node() for each record, but the enforcement added in this PR appears to happen only for transactional ops (in mria_upstream:transactional_wrapper/3) and only for write/delete_object. Dirty operations (and transactional delete) currently bypass this check. Consider clarifying the doc to match current behavior, or extending enforcement so the statement is accurate.

@ieQu1 (Member, Author) Apr 17, 2026

That is the intent; the lack of enforcement for dirty writes is an implementation detail. Yes, currently unsound dirty operations can corrupt the local state, but dirty operations are called dirty for a reason.

### Limitations

- Only `ram_copies` storage is currently supported.

- Importing of data from remote nodes is always done using dirty operations.
`mria:transaction` and `mria:ro_transaction` interfaces do not guarantee atomicity when reading data from the remote nodes.

## Enabling RLOG in your application

It is important to make the application code compatible with the RLOG feature by using the correct APIs.
3 changes: 2 additions & 1 deletion rebar.config
Expand Up @@ -6,7 +6,8 @@
{gen_rpc, {git, "https://github.com/emqx/gen_rpc", {tag, "3.4.1"}}},
{replayq, {git, "https://github.com/emqx/replayq", {tag, "0.3.6"}}},
{mnesia_rocksdb, {git, "https://github.com/emqx/mnesia_rocksdb", {tag, "0.1.17"}}},
{optvar, {git, "https://github.com/emqx/optvar", {tag, "1.0.5"}}}
{optvar, {git, "https://github.com/emqx/optvar", {tag, "1.0.5"}}},
{gproc, {git, "https://github.com/uwiger/gproc", {tag, "1.1.0"}}}
]}.

{erl_opts,
3 changes: 2 additions & 1 deletion src/mria.app.src
Expand Up @@ -19,7 +19,8 @@
gen_rpc,
replayq,
snabbkaffe,
optvar
optvar,
gproc
]},
{modules, []},
{licenses, ["Apache 2.0"]},
127 changes: 60 additions & 67 deletions src/mria.erl
@@ -1,5 +1,5 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2019-2025 EMQ Technologies Co., Ltd. All Rights Reserved.
%% Copyright (c) 2019-2026 EMQ Technologies Co., Ltd. All Rights Reserved.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
@@ -347,34 +347,35 @@ wait_for_tables(Tables) ->
end.

-spec ro_transaction(mria_rlog:shard(), fun(() -> A)) -> t_result(A).
ro_transaction(?LOCAL_CONTENT_SHARD, Fun) ->
maybe_middleman(mnesia, transaction, [fun ro_transaction/1, [Fun]]);
ro_transaction(Shard, Fun) ->
case mria_rlog:role() of
core ->
maybe_middleman(mnesia, transaction, [fun ro_transaction/1, [Fun]]);
replicant ->
?tp(mria_ro_transaction, #{role => replicant}),
case mria_status:upstream(Shard) of
{ok, AgentPid} ->
Ret = maybe_middleman(mnesia, transaction, [fun ro_transaction/1, [Fun]]),
%% Now we check that the agent pid is still the
%% same, meaning the replicant node haven't gone
%% through bootstrapping process while running the
%% transaction and it didn't have a chance to
%% observe the stale writes.
case mria_status:upstream(Shard) of
{ok, AgentPid} ->
Ret;
_ ->
%% Restart transaction. If the shard is
%% still disconnected, it will become an
%% RPC call to a core node:
ro_transaction(Shard, Fun)
end;
disconnected ->
ro_trans_rpc(Shard, Fun)
end
maybe
{ok, Writes} ?= mria_rlog:shard_writes(Shard),
case Writes of
remote ->
?tp(mria_ro_transaction, #{role => replicant}),
case mria_status:upstream(Shard) of
{ok, AgentPid} ->
Ret = maybe_middleman(mnesia, transaction, [fun ro_transaction/1, [Fun]]),
%% Now we check that the agent pid is still the
%% same, meaning the replicant node haven't gone
%% through bootstrapping process while running the
%% transaction and it didn't have a chance to
%% observe the stale writes.
case mria_status:upstream(Shard) of
{ok, AgentPid} ->
Ret;
_ ->
%% Restart transaction. If the shard is
%% still disconnected, it will become an
%% RPC call to a core node:
ro_transaction(Shard, Fun)
end;
disconnected ->
ro_trans_rpc(Shard, Fun)
end;
_ ->
maybe_middleman(mnesia, transaction, [fun ro_transaction/1, [Fun]])
end
end.

%% @doc Synchronous transaction.
@@ -392,15 +393,16 @@ ro_transaction(Shard, Fun) ->
-spec sync_transaction(mria_rlog:shard(), fun((...) -> A), list(), timeout()) ->
t_result(A) | {timeout, t_result(A)} | {timeout, {error, shard_not_ready}}.
sync_transaction(Shard, Function, Args, ReplTimeout) ->
case {mria_config:whoami(), Shard} of
{mnesia, _} ->
maybe_middleman(mnesia, transaction, [Function, Args]);
{_, ?LOCAL_CONTENT_SHARD} ->
maybe_middleman(mria_upstream, transactional_wrapper, [?LOCAL_CONTENT_SHARD, Function, Args]);
{core, _} ->
maybe_middleman(mria_upstream, transactional_wrapper, [Shard, Function, Args]);
{replicant, _} ->
sync_replicant_trans(Shard, Function, Args, ReplTimeout)
maybe
{ok, Writes} ?= mria_rlog:shard_writes(Shard),
case Writes of
mnesia ->
maybe_middleman(mnesia, transaction, [Function, Args]);
local ->
maybe_middleman(mria_upstream, transactional_wrapper, [Shard, Function, Args]);
remote ->
sync_replicant_trans(Shard, Function, Args, ReplTimeout)
end
end.

-spec sync_transaction(mria_rlog:shard(), fun((...) -> A), list()) ->
@@ -415,34 +417,35 @@ sync_transaction(Shard, Fun) ->

-spec transaction(mria_rlog:shard(), fun((...) -> A), list()) -> t_result(A).
transaction(Shard, Function, Args) ->
case {mria_config:whoami(), Shard} of
{mnesia, _} ->
maybe_middleman(mnesia, transaction, [Function, Args]);
{_, ?LOCAL_CONTENT_SHARD} ->
maybe_middleman(mria_upstream, transactional_wrapper, [?LOCAL_CONTENT_SHARD, Function, Args]);
{core, _} ->
maybe_middleman(mria_upstream, transactional_wrapper, [Shard, Function, Args]);
{replicant, _} ->
rpc_to_core_node(Shard, mria_upstream, transactional_wrapper, [Shard, Function, Args])
maybe
{ok, Writes} ?= mria_rlog:shard_writes(Shard),
case Writes of
mnesia ->
maybe_middleman(mnesia, transaction, [Function, Args]);
local ->
maybe_middleman(mria_upstream, transactional_wrapper, [Shard, Function, Args]);
remote ->
rpc_to_core_node(Shard, mria_upstream, transactional_wrapper, [Shard, Function, Args])
end
end.

-spec transaction(mria_rlog:shard(), fun(() -> A)) -> t_result(A).
transaction(Shard, Fun) ->
transaction(Shard, Fun, []).

-spec async_dirty(mria_rlog:shard(), fun((...) -> A), list()) -> A | no_return().
-spec async_dirty(mria_rlog:shard(), fun((...) -> A), list()) -> A.
async_dirty(Shard, Fun, Args) ->
call_backend_rw(Shard, mnesia, async_dirty, [Fun, Args]).

-spec async_dirty(mria_rlog:shard(), fun(() -> A)) -> A | no_return().
-spec async_dirty(mria_rlog:shard(), fun(() -> A)) -> A.
async_dirty(Shard, Fun) ->
async_dirty(Shard, Fun, []).

-spec sync_dirty(mria_rlog:shard(), fun((...) -> A), list()) -> A | no_return().
-spec sync_dirty(mria_rlog:shard(), fun((...) -> A), list()) -> A.
sync_dirty(Shard, Fun, Args) ->
call_backend_rw(Shard, mnesia, sync_dirty, [Fun, Args]).

-spec sync_dirty(mria_rlog:shard(), fun(() -> A)) -> A | no_return().
-spec sync_dirty(mria_rlog:shard(), fun(() -> A)) -> A.
sync_dirty(Shard, Fun) ->
sync_dirty(Shard, Fun, []).

@@ -519,11 +522,14 @@ call_backend_rw_dirty(Function, Table, Args) ->

-spec call_backend_rw(mria_rlog:shard(), module(), atom(), list()) -> term().
call_backend_rw(Shard, Module, Function, Args) ->
case is_upstream(Shard) of
true ->
case mria_rlog:shard_writes(Shard) of
{ok, remote} ->
rpc_to_core_node(Shard, Module, Function, Args);
{ok, _} ->
%% Core or mnesia:
maybe_middleman(Module, Function, Args);
false ->
rpc_to_core_node(Shard, Module, Function, Args)
Badshard ->
exit(Badshard)
end.

-spec maybe_middleman(module(), atom(), list()) -> term().
@@ -663,19 +669,6 @@ do_assert_ro_trans() ->
Ops -> error({transaction_is_not_readonly, Ops})
end.

%% @doc Return `true' if the local node is the upstream for the shard.
-spec is_upstream(mria_rlog:shard()) -> boolean().
is_upstream(Shard) ->
case mria_config:whoami() of
replicant ->
case Shard of
?LOCAL_CONTENT_SHARD -> true;
_ -> false
end;
_ -> % core or mnesia
true
end.

%% Stop the application and reload the basic config from scratch.
-spec prep_restart(stop_reason()) -> ok.
prep_restart(Reason) ->
73 changes: 52 additions & 21 deletions src/mria_bootstrapper.erl
@@ -1,5 +1,5 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2021-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%% Copyright (c) 2021-2026 EMQ Technologies Co., Ltd. All Rights Reserved.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
@@ -45,16 +45,18 @@
%%================================================================================

-define(clear_table, clear_table).
-define(clear_table(NODE), {clear_table, NODE}).

-type batch() :: { _From :: pid()
, _Table :: mria:table()
, _Records :: [tuple()] | ?clear_table
, _Records :: [tuple()] | ?clear_table | ?clear_table(node())
}.

-record(iter,
{ table :: mria:table()
, storage :: atom() | {ext, _, _}
, state :: _
{ table :: mria:table()
, storage :: atom() | {ext, _, _}
, state :: _
, is_merge_shard :: boolean()
}).

-record(server,
@@ -65,9 +67,10 @@
}).

-record(client,
{ shard :: mria_rlog:shard()
, server :: pid()
, parent :: pid()
{ shard :: mria_rlog:shard()
, server :: pid()
, parent :: pid()
, is_merge_shard :: boolean()
}).

%%================================================================================
@@ -123,9 +126,11 @@ init({client, Shard, RemoteNode, Parent}) ->
}),
mria_status:notify_replicant_bootstrap_start(Shard),
{ok, Pid} = mria_rlog_server:bootstrap_me(RemoteNode, Shard),
{ok, #client{ parent = Parent
, shard = Shard
, server = Pid
{ok, IsMerge} = mria_schema:is_merge_shard(Shard),
{ok, #client{ parent = Parent
, shard = Shard
, server = Pid
, is_merge_shard = IsMerge
}}.

handle_info(loop, St = #server{}) ->
@@ -144,8 +149,8 @@ handle_call({complete, Server, Checkpoint}, From, St = #client{server = Server,
gen_server:reply(From, ok),
mria_status:notify_replicant_bootstrap_complete(Shard),
{stop, normal, St};
handle_call({batch, {Server, Table, Records}}, _From, St = #client{server = Server, shard = Shard}) ->
handle_batch(Server, Table, Records),
handle_call({batch, {Server, Table, Records}}, _From, St = #client{server = Server, shard = Shard, is_merge_shard = IsMerge}) ->
handle_batch(IsMerge, Server, Table, Records),
mria_status:notify_replicant_bootstrap_import(Shard),
{reply, ok, St};
handle_call(Call, From, St) ->
@@ -166,7 +171,8 @@ terminate(_Reason, St = #client{}) ->
%% Internal functions
%%================================================================================

-spec push_records(mria_lib:subscriber(), mria:table(), [tuple()] | ?clear_table) -> ok | {badrpc, _}.
-spec push_records(mria_lib:subscriber(), mria:table(), Commands) -> ok | {badrpc, _}
when Commands :: [tuple()] | ?clear_table | ?clear_table(node()).
push_records(Subscriber, Table, Records) ->
push_batch(Subscriber, {self(), Table, Records}).

@@ -178,12 +184,18 @@ push_batch({Node, Pid}, Batch = {_, _, _}) ->
complete({Node, Pid}, Server, Checkpoint) ->
mria_lib:rpc_call_nothrow(Node, ?MODULE, do_complete, [Pid, Server, Checkpoint]).

handle_batch(_Server, Table, ?clear_table) ->
handle_batch(_IsMerge, _Server, Table, ?clear_table) ->
mria_schema:ensure_local_table(Table),
{atomic, ok} = mnesia:clear_table(Table),
ok;
handle_batch(_Server, Table, Records) ->
lists:foreach(fun(I) -> mnesia:dirty_write(Table, I) end, Records).
handle_batch(_IsMerge, _Server, Table, ?clear_table(Node)) ->
mria_schema:ensure_local_table(Table),
mria_rlog_replica:clean_merge_table(Table, Node),
ok;
handle_batch(false, _Server, Table, Records) ->
lists:foreach(fun(I) -> mnesia:dirty_write(Table, I) end, Records);
handle_batch(true, _Server, Table, Records) ->
ets:insert(Table, Records).

server_loop(St = #server{tables = [], subscriber = Subscriber, iterator = undefined}) ->
%% All tables and chunks have been sent:
@@ -237,22 +249,36 @@ iter_start(Subscriber, Table, BatchSize) ->
%% Push an empty batch to the replica to make sure it created the
%% local table before we start actual iteration and the receiving
%% table is empty:
push_records(Subscriber, Table, ?clear_table),
{ok, IsMerge} = mria_schema:is_merge_table(Table),
ClearCommand = case IsMerge of
true -> ?clear_table(node());
false -> ?clear_table
end,
push_records(Subscriber, Table, ClearCommand),
%% Start iteration over records:
mnesia_lib:db_fixtable(Storage, Table, true),
Iter0 = #iter{ table = Table
, storage = Storage
, is_merge_shard = IsMerge
},
case mnesia_lib:db_init_chunk(Storage, Table, BatchSize) of
InitChunk = case IsMerge of
true ->
{ok, NodePattern} = mria_schema:get_merged_table_node_pattern(Table),
MS = {NodePattern, [{'==', '$1', node()}], ['$_']},
ets:select(Table, [MS], BatchSize);
false ->
mnesia_lib:db_init_chunk(Storage, Table, BatchSize)
end,
case InitChunk of
{Matches, Cont} ->
{Iter0#iter{state = Cont}, Matches};
?end_of_table ->
{Iter0, ?end_of_table}
end.

-spec iter_next(#iter{}) -> {#iter{}, [tuple()] | ?end_of_table}.
iter_next(Iter0 = #iter{storage = Storage, state = State}) ->
case mnesia_lib:db_chunk(Storage, State) of
iter_next(Iter0 = #iter{storage = Storage, state = State, is_merge_shard = IsMerge}) ->
case next_chunk(IsMerge, Storage, State) of
{Matches, Cont} ->
{Iter0#iter{state = Cont}, Matches};
?end_of_table ->
@@ -262,3 +288,8 @@
-spec iter_end(#iter{}) -> ok.
iter_end(#iter{table = Table, storage = Storage}) ->
mnesia_lib:db_fixtable(Storage, Table, false).

next_chunk(false, Storage, Iter) ->
mnesia_lib:db_chunk(Storage, Iter);
next_chunk(true, ram_copies, Iter) ->
ets:select(Iter).