@@ -51,6 +51,10 @@ pub const BeamNode = struct {
5151 logger : zeam_utils.ModuleLogger ,
5252 node_registry : * const NodeNameRegistry ,
5353
54+ // Stall detection: track head slot progression
55+ last_checked_head_slot : types.Slot = 0 ,
56+ stall_interval_count : usize = 0 ,
57+
5458 const Self = @This ();
5559
5660 pub fn init (self : * Self , allocator : Allocator , opts : NodeOpts ) ! void {
@@ -490,12 +494,20 @@ pub const BeamNode = struct {
490494 const missing_roots = self .chain .onBlock (signed_block .* , .{}) catch | err | {
491495 // Check if the error is due to missing parent
492496 if (err == chainFactory .BlockProcessingError .MissingPreState ) {
493- // Check if we've hit the max depth
497+ // Check if we've hit the max depth - this strongly suggests fork divergence
494498 if (current_depth >= constants .MAX_BLOCK_FETCH_DEPTH ) {
495- self .logger .warn (
496- "Reached max block fetch depth ({d}) for block 0x{x}, discarding" ,
497- .{ constants .MAX_BLOCK_FETCH_DEPTH , & block_root },
499+ self .logger .err (
500+ "FORK DIVERGENCE LIKELY: Reached max block fetch depth ({d}) for block 0x{x} at slot {d} without finding common ancestor. " ++
501+ "Our finalized slot={d}. Checkpoint sync required." ,
502+ .{
503+ constants .MAX_BLOCK_FETCH_DEPTH ,
504+ & block_root ,
505+ signed_block .message .block .slot ,
506+ self .chain .forkChoice .fcStore .latest_finalized .slot ,
507+ },
498508 );
509+ // TODO: Trigger automatic checkpoint sync recovery here.
510+ _ = self .network .pruneCachedBlocks (block_root , null );
499511 return ;
500512 }
501513
@@ -511,14 +523,33 @@ pub const BeamNode = struct {
511523 );
512524 } else | cache_err | {
513525 if (cache_err == CacheBlockError .PreFinalized ) {
514- // Block is pre-finalized - prune any cached descendants waiting for this parent
515- self .logger .info (
516- "block 0x{x} is pre-finalized (slot={d}), pruning cached descendants" ,
517- .{
518- & block_root ,
519- signed_block .message .block .slot ,
520- },
521- );
526+ // Block is pre-finalized but we got MissingPreState - we don't have its parent.
527+ // This means the parent chain from peers doesn't connect to our finalized chain.
528+ // This is definitive FORK DIVERGENCE.
529+ const parent_root = signed_block .message .block .parent_root ;
530+ const have_parent = self .chain .forkChoice .hasBlock (parent_root );
531+
532+ if (! have_parent ) {
533+ self .logger .err (
534+ "FORK DIVERGENCE DETECTED: block 0x{x} at slot {d} is pre-finalized but parent 0x{x} not in our chain. " ++
535+ "Peer's chain diverged before our finalized slot {d}. Checkpoint sync required." ,
536+ .{
537+ & block_root ,
538+ signed_block .message .block .slot ,
539+ & parent_root ,
540+ self .chain .forkChoice .fcStore .latest_finalized .slot ,
541+ },
542+ );
543+ // TODO: Trigger automatic checkpoint sync recovery here.
544+ } else {
545+ self .logger .info (
546+ "block 0x{x} is pre-finalized (slot={d}), pruning cached descendants" ,
547+ .{
548+ & block_root ,
549+ signed_block .message .block .slot ,
550+ },
551+ );
552+ }
522553 _ = self .network .pruneCachedBlocks (block_root , null );
523554 } else {
524555 self .logger .warn ("failed to cache block 0x{x}: {any}" , .{
@@ -612,6 +643,8 @@ pub const BeamNode = struct {
612643 switch (sync_status ) {
613644 .behind_peers = > | info | {
614645 // Only sync from this peer if their finalized slot is ahead of ours
646+ // Note: Fork divergence is already detected by getSyncStatus() which checks
647+ // if peer.finalized_slot <= our_head_slot AND we don't have their block.
615648 if (status_resp .finalized_slot > self .chain .forkChoice .fcStore .latest_finalized .slot ) {
616649 self .logger .info ("peer {s}{any} is ahead (peer_finalized_slot={d} > our_head_slot={d}), initiating sync by requesting head block 0x{x}" , .{
617650 status_ctx .peer_id ,
@@ -630,6 +663,15 @@ pub const BeamNode = struct {
630663 };
631664 }
632665 },
666+ .fork_diverged = > | diverge_info | {
667+ self .logger .err ("FORK DIVERGENCE DETECTED: our finalized=0x{x} at slot {d}, peer finalized=0x{x} at slot {d}. Checkpoint sync required to recover." , .{
668+ & diverge_info .our_finalized_root ,
669+ diverge_info .our_finalized_slot ,
670+ & diverge_info .peer_finalized_root ,
671+ diverge_info .peer_finalized_slot ,
672+ });
673+ // TODO: Trigger automatic checkpoint sync recovery here.
674+ },
633675 .synced , .no_peers = > {},
634676 }
635677 },
@@ -909,6 +951,9 @@ pub const BeamNode = struct {
909951 // Sweep timed-out RPC requests to prevent sync stalls from non-responsive peers
910952 self .sweepTimedOutRequests ();
911953
954+ // Stall detection: if head hasn't advanced while behind peers, we may be stuck
955+ self .checkSyncStall ();
956+
912957 if (self .validator ) | * validator | {
913958 // we also tick validator per interval in case it would
914959 // need to sync its future duties when its an independent validator
@@ -993,6 +1038,46 @@ pub const BeamNode = struct {
9931038 }
9941039 }
9951040
/// Detects sync stalls: the head slot is not advancing while the chain reports
/// that we are behind (or diverged from) our peers.
///
/// Intended to be invoked once per interval tick (NOTE(review): confirm the call
/// site cadence). State is kept in two fields on `Self`:
///   - `last_checked_head_slot`: head slot observed on the previous call.
///   - `stall_interval_count`: consecutive calls on which the head did not
///     advance while we were behind peers.
///
/// The counter is reset whenever the head advances or the sync status is
/// `.synced` / `.no_peers`. An error is logged each time the counter reaches a
/// multiple of the report period. This function only logs; it does not trigger
/// any recovery.
fn checkSyncStall(self: *Self) void {
    // Number of consecutive stalled intervals between two stall reports.
    // Roughly one report per minute if an interval is ~1s — TODO confirm interval length.
    const stall_report_period: usize = 60;

    const current_head_slot = self.chain.forkChoice.head.slot;
    const head_advanced = current_head_slot > self.last_checked_head_slot;

    // Always remember the slot we just observed. If the head moved to a lower
    // slot (e.g. after a reorg), comparing against the old, higher slot would
    // hide subsequent forward progress and produce false stall reports.
    self.last_checked_head_slot = current_head_slot;

    // Forward progress since the last check: not stalled.
    if (head_advanced) {
        self.stall_interval_count = 0;
        return;
    }

    // A non-advancing head only counts as a stall while we are behind peers.
    const is_behind = switch (self.chain.getSyncStatus()) {
        .behind_peers, .fork_diverged => true,
        .synced, .no_peers => false,
    };
    if (!is_behind) {
        self.stall_interval_count = 0;
        return;
    }

    self.stall_interval_count += 1;

    // Report once per period. The counter is >= 1 here, so the modulo check
    // alone is sufficient. The cached block count is included purely as
    // diagnostic context; it does not gate the report.
    if (self.stall_interval_count % stall_report_period == 0) {
        self.logger.err("SYNC STALL DETECTED: head stuck at slot {d} for {d} intervals, cached_blocks={d}, forkchoice_nodes={d}. Consider checkpoint sync.", .{
            current_head_slot,
            self.stall_interval_count,
            self.network.fetched_blocks.count(),
            self.chain.forkChoice.getNodeCount(),
        });
        // TODO: After prolonged stall (e.g., 5+ minutes), automatically trigger checkpoint sync.
    }
}
1080+
9961081 pub fn publishBlock (self : * Self , signed_block : types.SignedBlockWithAttestation ) ! void {
9971082 const block = signed_block .message .block ;
9981083
0 commit comments