Skip to content

[Backport into 5.18] disable node disconnect on error and detention #8902

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 26, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion config.js
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,13 @@ config.NODES_FREE_SPACE_RESERVE = 100 * (1024 ** 2);
// don't use agents with less than reserve + 5 GB
config.MINIMUM_AGENT_TOTAL_STORAGE = config.NODES_FREE_SPACE_RESERVE + (5 * (1024 ** 3));

config.NODE_IO_DETENTION_DISABLE = false;

// by default not disconnecting nodes on error. This caused more issues than benefits
config.NODES_DISCONNECT_ON_ERROR = false;

// by default not detaining nodes on io errors. This caused more issues than benefits
config.NODE_IO_DETENTION_DISABLE = true;

config.NODE_IO_DETENTION_THRESHOLD = 60 * 1000;
config.NODE_IO_DETENTION_RECENT_ISSUES = 5;
// Picked two because minimum of nodes per pool is three
Expand Down
24 changes: 14 additions & 10 deletions src/server/node_services/nodes_monitor.js
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ class NodesMonitor extends EventEmitter {
return P.resolve()
.then(() => this._run())
.then(() => {
// do nothing.
// do nothing.
});
}

Expand Down Expand Up @@ -1012,7 +1012,7 @@ class NodesMonitor extends EventEmitter {
})
.then(() => this._update_nodes_store('force'))
.then(() => {
// do nothing.
// do nothing.
});
}

Expand Down Expand Up @@ -1236,15 +1236,15 @@ class NodesMonitor extends EventEmitter {
if (item.node.deleted) return;
if (!item.connection) return;
if (!item.agent_info) return;
//The node should be set as enable if it is not decommissioned.
//The node should be set as enable if it is not decommissioned.
const should_enable = !item.node.decommissioned;
const item_pool = system_store.data.get_by_id(item.node.pool);
const location_info = {
node_id: String(item.node._id),
host_id: String(item.node.host_id),
pool_id: String(item.node.pool),
};
// We should only add region if it is defined.
// We should only add region if it is defined.
if (item_pool && !_.isUndefined(item_pool.region)) location_info.region = item_pool.region;
// We should change the service enable field if the field is not equal to the decommissioned decision.
const service_enabled_not_changed = (item.node.enabled === should_enable);
Expand Down Expand Up @@ -3373,12 +3373,16 @@ class NodesMonitor extends EventEmitter {
'node', item.node.name,
'issues_report', item.node.issues_report,
'block_report', block_report);
// disconnect from the node to force reconnect
// only disconnect if enough time passed since last disconnect to avoid amplification of errors in R\W flows
const DISCONNECT_GRACE_PERIOD = 2 * 60 * 1000; // 2 minutes grace before another disconnect
if (!item.disconnect_time || item.disconnect_time + DISCONNECT_GRACE_PERIOD < Date.now()) {
dbg.log0('disconnecting node to force reconnect. node:', item.node.name);
this._disconnect_node(item);


if (config.NODES_DISCONNECT_ON_ERROR) {
// disconnect from the node to force reconnect
// only disconnect if enough time passed since last disconnect to avoid amplification of errors in R\W flows
const DISCONNECT_GRACE_PERIOD = 2 * 60 * 1000; // 2 minutes grace before another disconnect
if (!item.disconnect_time || item.disconnect_time + DISCONNECT_GRACE_PERIOD < Date.now()) {
dbg.log0('disconnecting node to force reconnect. node:', item.node.name);
this._disconnect_node(item);
}
}
}
}
Expand Down