Skip to content

Commit 575e08d

Browse files
committed
fix(vpn): self-recover from expired WG sessions and dedupe log spam
When boringtun returns ConnectionExpired, the daemon previously entered a catatonic loop — the dead Tunn kept returning the same error on every tick without ever re-handshaking. The tunnel is now rebuilt immediately on ConnectionExpired (fix #1), eliminating the deadlock. A per-peer consecutive-rebuild counter escalates to a node-key rotation signal after three rebuilds without a successful handshake, covering the case where the server-side session is also stuck and needs a NodeKey change to force a PeersChanged push (fix #2). Per-peer warn rate-limiting suppresses the 30+ duplicate log lines that previously appeared within 100ms when NoCurrentSession errors flooded decapsulate (fix #4). The Running daemon status now carries last_handshake_fail so operators can see when the most recent expiry occurred via `sunbeam vpn status` (fix #3).

Signed-off-by: Sienna Meridian Satterwhite <sienna@r3t.io>
1 parent e2d9d8e commit 575e08d

9 files changed

Lines changed: 378 additions & 22 deletions

File tree

platform/cli/sunbeam-net/src/daemon/endpoint.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,12 @@ struct PeerEndpoint {
2626
last_call_me_maybe: Option<Instant>,
2727
}
2828

29+
impl Default for EndpointTracker {
30+
fn default() -> Self {
31+
Self::new()
32+
}
33+
}
34+
2935
impl EndpointTracker {
3036
pub fn new() -> Self {
3137
Self {

platform/cli/sunbeam-net/src/daemon/ipc.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -367,6 +367,7 @@ mod tests {
367367
peer_count: 3,
368368
derp_home: Some(1),
369369
socks_proxy_port: None,
370+
last_handshake_fail: None,
370371
}));
371372

372373
let server = IpcServer::new(

platform/cli/sunbeam-net/src/daemon/lifecycle.rs

Lines changed: 55 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,11 +51,14 @@ async fn run_daemon_loop(
5151
// would see a stale socket file and report "stale socket".
5252
let _socket_guard = SocketGuard::new(config.control_socket.clone());
5353

54-
let keys = crate::keys::NodeKeys::load_or_generate(&config.state_dir)?;
5554
let mut attempt: u32 = 0;
5655
let max_backoff = Duration::from_secs(60);
5756

5857
loop {
58+
// Reload keys every iteration so a key rotation written in the previous
59+
// session is picked up before re-registering.
60+
let keys = crate::keys::NodeKeys::load_or_generate(&config.state_dir)?;
61+
5962
set_status(&status, DaemonStatus::Connecting);
6063

6164
let session = run_session(&config, &keys, &status, &shutdown);
@@ -71,6 +74,12 @@ async fn run_daemon_loop(
7174
};
7275

7376
match session_result {
77+
Ok(SessionExit::RotatedKey) => {
78+
// Keys already written; reload happens at the top of the next
79+
// iteration. Skip backoff — we want to re-register quickly.
80+
attempt = 0;
81+
continue;
82+
}
7483
Ok(SessionExit::Disconnected) | Err(_) => {
7584
if let Err(ref e) = session_result {
7685
tracing::error!("session ended with error: {e:?}");
@@ -93,6 +102,9 @@ async fn run_daemon_loop(
93102

94103
/// Reason a session ended without an error, as reported to
/// `run_daemon_loop` inside the `Ok(..)` of the session result.
enum SessionExit {
95104
/// The session ended without a successful key rotation.
/// `run_daemon_loop` handles this in the same match arm as a session error.
Disconnected,
105+
/// The stuck-handshake watchdog rotated the node key. The outer loop must
106+
/// reload keys before starting the next session.
107+
RotatedKey,
96108
}
97109

98110
/// RAII guard that removes a Unix socket file when dropped. Used by
@@ -592,12 +604,16 @@ async fn run_session(
592604
// map_stream_loop into the WG loop so update_peers() is called
593605
// on every netmap change, not just at startup.
594606
let (peer_update_tx, peer_update_rx) = mpsc::channel::<Vec<Node>>(16);
607+
// Key-rotation signal: wg_loop → run_session. Capacity 1 — one pending
608+
// rotation is enough; extras are silently dropped via try_send.
609+
let (rotate_key_tx, mut rotate_key_rx) = mpsc::channel::<()>(1);
595610
let engine_to_wg_rx = channels.engine_to_wg_rx;
596611
let wg_to_engine_tx = channels.wg_to_engine_tx;
597612
let wg_cancel = cancel.clone();
598613
// Convert disco keys to crypto_box types for NaCl seal/open.
599614
let disco_secret = crypto_box::SecretKey::from(keys.disco_private.to_bytes());
600615
let my_disco_pub: [u8; 32] = *keys.disco_public.as_bytes();
616+
let wg_status = status.clone();
601617
let wg_task = tokio::spawn(async move {
602618
run_wg_loop(
603619
wg_tunnel,
@@ -612,6 +628,8 @@ async fn run_session(
612628
disco_secret,
613629
my_disco_pub,
614630
stun_bcast_tx.clone(),
631+
wg_status,
632+
rotate_key_tx,
615633
wg_cancel,
616634
)
617635
.await
@@ -640,11 +658,13 @@ async fn run_session(
640658
peer_count,
641659
derp_home,
642660
socks_proxy_port,
661+
last_handshake_fail: None,
643662
},
644663
);
645664

646665
// 11. Run concurrent tasks
647666
let route_whitelist = config.route_whitelist.clone();
667+
let state_dir = config.state_dir.clone();
648668
tokio::select! {
649669
result = map_stream_loop(&mut map_stream, status, &route_table, &route_whitelist, &peer_update_tx, &derp_cmd_tx) => {
650670
eprintln!("[session] map_stream_loop exited: {result:?}");
@@ -679,6 +699,17 @@ async fn run_session(
679699
cancel.cancel();
680700
result.map(|_| SessionExit::Disconnected)
681701
}
702+
_ = rotate_key_rx.recv() => {
703+
tracing::info!("rotating node key after stuck-handshake watchdog escalation");
704+
cancel.cancel();
705+
match crate::keys::NodeKeys::rotate_node_key(&state_dir) {
706+
Ok(()) => Ok(SessionExit::RotatedKey),
707+
Err(e) => {
708+
tracing::error!("node key rotation failed: {e:?}");
709+
Ok(SessionExit::Disconnected)
710+
}
711+
}
712+
}
682713
}
683714
}
684715

@@ -742,6 +773,8 @@ async fn run_wg_loop(
742773
disco_private: crypto_box::SecretKey,
743774
my_disco_pub: [u8; 32],
744775
stun_response_tx: tokio::sync::broadcast::Sender<(std::net::SocketAddr, Vec<u8>)>,
776+
status: Arc<RwLock<DaemonStatus>>,
777+
rotate_key_tx: mpsc::Sender<()>,
745778
cancel: tokio_util::sync::CancellationToken,
746779
) {
747780
let mut tick_interval = tokio::time::interval(Duration::from_millis(250));
@@ -852,10 +885,21 @@ async fn run_wg_loop(
852885
}
853886
}
854887
_ = tick_interval.tick() => {
855-
let actions = tunnel.tick();
856-
for ta in actions {
888+
let tick_result = tunnel.tick();
889+
for ta in tick_result.actions {
857890
dispatch_encap(ta.action, &derp_out_tx, &udp_out_tx).await;
858891
}
892+
if tick_result.had_connection_expired {
893+
let ts = std::time::SystemTime::now()
894+
.duration_since(std::time::UNIX_EPOCH)
895+
.unwrap_or_default()
896+
.as_secs();
897+
update_last_handshake_fail(&status, ts);
898+
}
899+
if tick_result.needs_key_rotation {
900+
tracing::warn!("stuck-handshake watchdog: escalating to node-key rotation");
901+
let _ = rotate_key_tx.try_send(());
902+
}
859903
}
860904
}
861905
}
@@ -1315,6 +1359,14 @@ fn update_peer_count(status: &Arc<RwLock<DaemonStatus>>, count: usize) {
13151359
}
13161360
}
13171361

1362+
fn update_last_handshake_fail(status: &Arc<RwLock<DaemonStatus>>, ts_secs: u64) {
1363+
if let Ok(mut s) = status.write()
1364+
&& let DaemonStatus::Running { last_handshake_fail, .. } = &mut *s
1365+
{
1366+
*last_handshake_fail = Some(ts_secs);
1367+
}
1368+
}
1369+
13181370
#[cfg(test)]
13191371
mod tests {
13201372
use super::*;

platform/cli/sunbeam-net/src/daemon/state.rs

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,12 @@ pub enum DaemonStatus {
3030
/// matching auth token from `{state_dir}/socks5.auth`.
3131
#[serde(default)]
3232
socks_proxy_port: Option<u16>,
33+
/// Unix timestamp (seconds) of the most recent `ConnectionExpired`
34+
/// from any peer's WireGuard session. `None` if no expiry has
35+
/// occurred since the daemon last reached Running. Backwards-compatible:
36+
/// old daemons omit this field and clients deserialize it as `None`.
37+
#[serde(default)]
38+
last_handshake_fail: Option<u64>,
3339
},
3440
/// Reconnecting after a connection loss.
3541
Reconnecting {
@@ -170,6 +176,7 @@ mod tests {
170176
peer_count: 3,
171177
derp_home: Some(1),
172178
socks_proxy_port: Some(16580),
179+
last_handshake_fail: None,
173180
};
174181
assert_eq!(running.to_string(), "running (100.64.0.1), 3 peers");
175182
}
@@ -181,12 +188,49 @@ mod tests {
181188
peer_count: 5,
182189
derp_home: Some(2),
183190
socks_proxy_port: None,
191+
last_handshake_fail: None,
184192
};
185193
let json = serde_json::to_string(&status).unwrap();
186194
let deserialized: DaemonStatus = serde_json::from_str(&json).unwrap();
187195
assert_eq!(status, deserialized);
188196
}
189197

198+
/// `last_handshake_fail` must survive a JSON round trip both when set and
/// when unset, and must default to `None` when the field is missing from the
/// input (the shape produced before the field existed).
#[test]
fn daemon_status_serializes_optional_last_handshake_fail() {
    // Encode to JSON and decode again; shared by both round-trip cases.
    let round_trip = |status: &DaemonStatus| -> DaemonStatus {
        let encoded = serde_json::to_string(status).unwrap();
        serde_json::from_str(&encoded).unwrap()
    };

    // Case 1: a populated timestamp.
    let now_secs: u64 = 1_745_000_000;
    let with_fail = DaemonStatus::Running {
        addresses: vec!["100.64.0.1".parse().unwrap()],
        peer_count: 1,
        derp_home: None,
        socks_proxy_port: None,
        last_handshake_fail: Some(now_secs),
    };
    assert_eq!(with_fail, round_trip(&with_fail));

    // Case 2: no timestamp recorded.
    let without_fail = DaemonStatus::Running {
        addresses: Vec::new(),
        peer_count: 0,
        derp_home: None,
        socks_proxy_port: None,
        last_handshake_fail: None,
    };
    assert_eq!(without_fail, round_trip(&without_fail));

    // Case 3: legacy JSON that lacks the field entirely.
    let legacy_json = r#"{"running":{"addresses":["100.64.0.2"],"peer_count":2,"derp_home":1,"socks_proxy_port":null}}"#;
    let legacy: DaemonStatus = serde_json::from_str(legacy_json).unwrap();
    let field_defaulted = matches!(
        legacy,
        DaemonStatus::Running { last_handshake_fail: None, .. }
    );
    assert!(field_defaulted);
}
233+
190234
#[test]
191235
fn daemon_handle_default_status() {
192236
let handle = DaemonHandle::new("/tmp/test.sock".into());

platform/cli/sunbeam-net/src/discovery/registry.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -153,10 +153,10 @@ impl RegistryWatcher {
153153
match tokio::fs::metadata(&path).await {
154154
Ok(meta) => {
155155
let mt = meta.modified().ok();
156-
if mt != last_mtime {
157-
if let Some(new_mt) = reload(&path, &registry) {
158-
last_mtime = Some(new_mt);
159-
}
156+
if mt != last_mtime
157+
&& let Some(new_mt) = reload(&path, &registry)
158+
{
159+
last_mtime = Some(new_mt);
160160
}
161161
}
162162
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {

platform/cli/sunbeam-net/src/keys.rs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,25 @@ impl NodeKeys {
9494
Ok(keys)
9595
}
9696

97+
/// Rotate only the `node_private` key in `state_dir/keys.json`, preserving
98+
/// the disco and wg keys. Writes atomically: temp file then rename so a
99+
/// crash mid-write can't corrupt the existing key.
100+
pub fn rotate_node_key(state_dir: &Path) -> crate::Result<()> {
101+
let path = state_dir.join(KEYS_FILE);
102+
let existing_data = std::fs::read_to_string(&path).ctx("reading keys for rotation")?;
103+
let mut persisted: PersistedKeys =
104+
serde_json::from_str(&existing_data).map_err(crate::Error::Json)?;
105+
let new_node_private = StaticSecret::random_from_rng(OsRng);
106+
persisted.node_private = hex::encode(new_node_private.as_bytes());
107+
let new_data = serde_json::to_string_pretty(&persisted)?;
108+
// Write to a temp file in the same directory then atomically rename.
109+
let tmp_path = path.with_extension("json.tmp");
110+
std::fs::write(&tmp_path, new_data).ctx("writing rotated key to temp file")?;
111+
std::fs::rename(&tmp_path, &path).ctx("renaming rotated key file")?;
112+
tracing::info!("node key rotated and persisted to {}", path.display());
113+
Ok(())
114+
}
115+
97116
/// Tailscale-style node key string: `nodekey:<hex>`.
98117
pub fn node_key_str(&self) -> String {
99118
format!("nodekey:{}", hex::encode(self.node_public.as_bytes()))

platform/cli/sunbeam-net/src/noise/handshake.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -438,7 +438,7 @@ mod tests {
438438
// Ciphertext should differ from plaintext.
439439
assert_ne!(&data[..], b"hello");
440440

441-
assert_eq!(result.protocol_version, 49);
441+
assert_eq!(result.protocol_version, crate::CURRENT_CAP_VER);
442442

443443
server_handle.await.unwrap();
444444
}

0 commit comments

Comments
 (0)