@@ -51,11 +51,14 @@ async fn run_daemon_loop(
5151 // would see a stale socket file and report "stale socket".
5252 let _socket_guard = SocketGuard :: new ( config. control_socket . clone ( ) ) ;
5353
54- let keys = crate :: keys:: NodeKeys :: load_or_generate ( & config. state_dir ) ?;
5554 let mut attempt: u32 = 0 ;
5655 let max_backoff = Duration :: from_secs ( 60 ) ;
5756
5857 loop {
58+ // Reload keys every iteration so a key rotation written in the previous
59+ // session is picked up before re-registering.
60+ let keys = crate :: keys:: NodeKeys :: load_or_generate ( & config. state_dir ) ?;
61+
5962 set_status ( & status, DaemonStatus :: Connecting ) ;
6063
6164 let session = run_session ( & config, & keys, & status, & shutdown) ;
@@ -71,6 +74,12 @@ async fn run_daemon_loop(
7174 } ;
7275
7376 match session_result {
77+ Ok ( SessionExit :: RotatedKey ) => {
78+ // Keys already written; reload happens at the top of the next
79+ // iteration. Skip backoff — we want to re-register quickly.
80+ attempt = 0 ;
81+ continue ;
82+ }
7483 Ok ( SessionExit :: Disconnected ) | Err ( _) => {
7584 if let Err ( ref e) = session_result {
7685 tracing:: error!( "session ended with error: {e:?}" ) ;
@@ -93,6 +102,9 @@ async fn run_daemon_loop(
93102
/// Reason a single `run_session` call finished without returning an error.
///
/// The daemon loop matches on this value to decide how the next session is
/// started.
enum SessionExit {
    /// The session ended without a successful key rotation. The daemon loop
    /// handles this in the same match arm as a session error.
    Disconnected,
    /// The node key was rotated after the stuck-handshake watchdog asked for
    /// it. The daemon loop has to reload keys before the next session, and it
    /// resets its attempt counter instead of backing off.
    RotatedKey,
}
97109
98110/// RAII guard that removes a Unix socket file when dropped. Used by
@@ -592,12 +604,16 @@ async fn run_session(
592604 // map_stream_loop into the WG loop so update_peers() is called
593605 // on every netmap change, not just at startup.
594606 let ( peer_update_tx, peer_update_rx) = mpsc:: channel :: < Vec < Node > > ( 16 ) ;
607+ // Key-rotation signal: wg_loop → run_session. Capacity 1 — one pending
608+ // rotation is enough; extras are silently dropped via try_send.
609+ let ( rotate_key_tx, mut rotate_key_rx) = mpsc:: channel :: < ( ) > ( 1 ) ;
595610 let engine_to_wg_rx = channels. engine_to_wg_rx ;
596611 let wg_to_engine_tx = channels. wg_to_engine_tx ;
597612 let wg_cancel = cancel. clone ( ) ;
598613 // Convert disco keys to crypto_box types for NaCl seal/open.
599614 let disco_secret = crypto_box:: SecretKey :: from ( keys. disco_private . to_bytes ( ) ) ;
600615 let my_disco_pub: [ u8 ; 32 ] = * keys. disco_public . as_bytes ( ) ;
616+ let wg_status = status. clone ( ) ;
601617 let wg_task = tokio:: spawn ( async move {
602618 run_wg_loop (
603619 wg_tunnel,
@@ -612,6 +628,8 @@ async fn run_session(
612628 disco_secret,
613629 my_disco_pub,
614630 stun_bcast_tx. clone ( ) ,
631+ wg_status,
632+ rotate_key_tx,
615633 wg_cancel,
616634 )
617635 . await
@@ -640,11 +658,13 @@ async fn run_session(
640658 peer_count,
641659 derp_home,
642660 socks_proxy_port,
661+ last_handshake_fail : None ,
643662 } ,
644663 ) ;
645664
646665 // 11. Run concurrent tasks
647666 let route_whitelist = config. route_whitelist . clone ( ) ;
667+ let state_dir = config. state_dir . clone ( ) ;
648668 tokio:: select! {
649669 result = map_stream_loop( & mut map_stream, status, & route_table, & route_whitelist, & peer_update_tx, & derp_cmd_tx) => {
650670 eprintln!( "[session] map_stream_loop exited: {result:?}" ) ;
@@ -679,6 +699,17 @@ async fn run_session(
679699 cancel. cancel( ) ;
680700 result. map( |_| SessionExit :: Disconnected )
681701 }
702+ _ = rotate_key_rx. recv( ) => {
703+ tracing:: info!( "rotating node key after stuck-handshake watchdog escalation" ) ;
704+ cancel. cancel( ) ;
705+ match crate :: keys:: NodeKeys :: rotate_node_key( & state_dir) {
706+ Ok ( ( ) ) => Ok ( SessionExit :: RotatedKey ) ,
707+ Err ( e) => {
708+ tracing:: error!( "node key rotation failed: {e:?}" ) ;
709+ Ok ( SessionExit :: Disconnected )
710+ }
711+ }
712+ }
682713 }
683714}
684715
@@ -742,6 +773,8 @@ async fn run_wg_loop(
742773 disco_private : crypto_box:: SecretKey ,
743774 my_disco_pub : [ u8 ; 32 ] ,
744775 stun_response_tx : tokio:: sync:: broadcast:: Sender < ( std:: net:: SocketAddr , Vec < u8 > ) > ,
776+ status : Arc < RwLock < DaemonStatus > > ,
777+ rotate_key_tx : mpsc:: Sender < ( ) > ,
745778 cancel : tokio_util:: sync:: CancellationToken ,
746779) {
747780 let mut tick_interval = tokio:: time:: interval ( Duration :: from_millis ( 250 ) ) ;
@@ -852,10 +885,21 @@ async fn run_wg_loop(
852885 }
853886 }
854887 _ = tick_interval. tick( ) => {
855- let actions = tunnel. tick( ) ;
856- for ta in actions {
888+ let tick_result = tunnel. tick( ) ;
889+ for ta in tick_result . actions {
857890 dispatch_encap( ta. action, & derp_out_tx, & udp_out_tx) . await ;
858891 }
892+ if tick_result. had_connection_expired {
893+ let ts = std:: time:: SystemTime :: now( )
894+ . duration_since( std:: time:: UNIX_EPOCH )
895+ . unwrap_or_default( )
896+ . as_secs( ) ;
897+ update_last_handshake_fail( & status, ts) ;
898+ }
899+ if tick_result. needs_key_rotation {
900+ tracing:: warn!( "stuck-handshake watchdog: escalating to node-key rotation" ) ;
901+ let _ = rotate_key_tx. try_send( ( ) ) ;
902+ }
859903 }
860904 }
861905 }
@@ -1315,6 +1359,14 @@ fn update_peer_count(status: &Arc<RwLock<DaemonStatus>>, count: usize) {
13151359 }
13161360}
13171361
1362+ fn update_last_handshake_fail ( status : & Arc < RwLock < DaemonStatus > > , ts_secs : u64 ) {
1363+ if let Ok ( mut s) = status. write ( )
1364+ && let DaemonStatus :: Running { last_handshake_fail, .. } = & mut * s
1365+ {
1366+ * last_handshake_fail = Some ( ts_secs) ;
1367+ }
1368+ }
1369+
13181370#[ cfg( test) ]
13191371mod tests {
13201372 use super :: * ;
0 commit comments