Skip to content

Commit 175e88f

Browse files
committed
prevent fork divergence false positives for behind peers
1 parent 8e77865 commit 175e88f

File tree

3 files changed

+148
-12
lines changed

3 files changed

+148
-12
lines changed

pkgs/node/src/chain.zig

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1343,6 +1343,12 @@ pub const BeamChain = struct {
13431343
finalized_slot: types.Slot,
13441344
max_peer_finalized_slot: types.Slot,
13451345
},
1346+
fork_diverged: struct {
1347+
our_finalized_slot: types.Slot,
1348+
our_finalized_root: types.Root,
1349+
peer_finalized_slot: types.Slot,
1350+
peer_finalized_root: types.Root,
1351+
},
13461352
};
13471353

13481354
/// Returns detailed sync status information.
@@ -1355,9 +1361,13 @@ pub const BeamChain = struct {
13551361

13561362
const our_head_slot = self.forkChoice.head.slot;
13571363
const our_finalized_slot = self.forkChoice.fcStore.latest_finalized.slot;
1364+
const our_finalized_root = self.forkChoice.fcStore.latest_finalized.root;
13581365

13591366
// Find the maximum finalized slot reported by any peer
13601367
var max_peer_finalized_slot: types.Slot = our_finalized_slot;
1368+
var found_fork_divergence: bool = false;
1369+
var diverged_peer_finalized_slot: types.Slot = 0;
1370+
var diverged_peer_finalized_root: types.Root = undefined;
13611371

13621372
var peer_iter = self.connected_peers.iterator();
13631373
while (peer_iter.next()) |entry| {
@@ -1366,9 +1376,33 @@ pub const BeamChain = struct {
13661376
if (status.finalized_slot > max_peer_finalized_slot) {
13671377
max_peer_finalized_slot = status.finalized_slot;
13681378
}
1379+
1380+
// Fork divergence check: if peer's finalized slot is at or ahead of our finalized slot
1381+
// but at or before our head, we should have their finalized block in forkchoice.
1382+
// If we don't, we're on a different fork.
1383+
// NOTE: We only check when peer.finalized >= our.finalized because blocks before
1384+
// our finalized checkpoint may have been pruned from forkchoice.
1385+
if (status.finalized_slot >= our_finalized_slot and
1386+
status.finalized_slot <= our_head_slot and
1387+
!self.forkChoice.hasBlock(status.finalized_root))
1388+
{
1389+
found_fork_divergence = true;
1390+
diverged_peer_finalized_slot = status.finalized_slot;
1391+
diverged_peer_finalized_root = status.finalized_root;
1392+
}
13691393
}
13701394
}
13711395

1396+
// Check 0: fork divergence detected — we're on a different chain than peers
1397+
if (found_fork_divergence) {
1398+
return .{ .fork_diverged = .{
1399+
.our_finalized_slot = our_finalized_slot,
1400+
.our_finalized_root = our_finalized_root,
1401+
.peer_finalized_slot = diverged_peer_finalized_slot,
1402+
.peer_finalized_root = diverged_peer_finalized_root,
1403+
} };
1404+
}
1405+
13721406
// Check 1: our head is behind peer finalization — we don't even have finalized blocks
13731407
if (our_head_slot < max_peer_finalized_slot) {
13741408
return .{ .behind_peers = .{

pkgs/node/src/node.zig

Lines changed: 97 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,10 @@ pub const BeamNode = struct {
5151
logger: zeam_utils.ModuleLogger,
5252
node_registry: *const NodeNameRegistry,
5353

54+
// Stall detection: track head slot progression
55+
last_checked_head_slot: types.Slot = 0,
56+
stall_interval_count: usize = 0,
57+
5458
const Self = @This();
5559

5660
pub fn init(self: *Self, allocator: Allocator, opts: NodeOpts) !void {
@@ -490,12 +494,20 @@ pub const BeamNode = struct {
490494
const missing_roots = self.chain.onBlock(signed_block.*, .{}) catch |err| {
491495
// Check if the error is due to missing parent
492496
if (err == chainFactory.BlockProcessingError.MissingPreState) {
493-
// Check if we've hit the max depth
497+
// Check if we've hit the max depth - this strongly suggests fork divergence
494498
if (current_depth >= constants.MAX_BLOCK_FETCH_DEPTH) {
495-
self.logger.warn(
496-
"Reached max block fetch depth ({d}) for block 0x{x}, discarding",
497-
.{ constants.MAX_BLOCK_FETCH_DEPTH, &block_root },
499+
self.logger.err(
500+
"FORK DIVERGENCE LIKELY: Reached max block fetch depth ({d}) for block 0x{x} at slot {d} without finding common ancestor. " ++
501+
"Our finalized slot={d}. Checkpoint sync required.",
502+
.{
503+
constants.MAX_BLOCK_FETCH_DEPTH,
504+
&block_root,
505+
signed_block.message.block.slot,
506+
self.chain.forkChoice.fcStore.latest_finalized.slot,
507+
},
498508
);
509+
// TODO: Trigger automatic checkpoint sync recovery here.
510+
_ = self.network.pruneCachedBlocks(block_root, null);
499511
return;
500512
}
501513

@@ -511,14 +523,33 @@ pub const BeamNode = struct {
511523
);
512524
} else |cache_err| {
513525
if (cache_err == CacheBlockError.PreFinalized) {
514-
// Block is pre-finalized - prune any cached descendants waiting for this parent
515-
self.logger.info(
516-
"block 0x{x} is pre-finalized (slot={d}), pruning cached descendants",
517-
.{
518-
&block_root,
519-
signed_block.message.block.slot,
520-
},
521-
);
526+
// Block is pre-finalized but we got MissingPreState - we don't have its parent.
527+
// This means the parent chain from peers doesn't connect to our finalized chain.
528+
// If the parent is also missing from forkchoice, this indicates FORK DIVERGENCE.
529+
const parent_root = signed_block.message.block.parent_root;
530+
const have_parent = self.chain.forkChoice.hasBlock(parent_root);
531+
532+
if (!have_parent) {
533+
self.logger.err(
534+
"FORK DIVERGENCE DETECTED: block 0x{x} at slot {d} is pre-finalized but parent 0x{x} not in our chain. " ++
535+
"Peer's chain diverged before our finalized slot {d}. Checkpoint sync required.",
536+
.{
537+
&block_root,
538+
signed_block.message.block.slot,
539+
&parent_root,
540+
self.chain.forkChoice.fcStore.latest_finalized.slot,
541+
},
542+
);
543+
// TODO: Trigger automatic checkpoint sync recovery here.
544+
} else {
545+
self.logger.info(
546+
"block 0x{x} is pre-finalized (slot={d}), pruning cached descendants",
547+
.{
548+
&block_root,
549+
signed_block.message.block.slot,
550+
},
551+
);
552+
}
522553
_ = self.network.pruneCachedBlocks(block_root, null);
523554
} else {
524555
self.logger.warn("failed to cache block 0x{x}: {any}", .{
@@ -612,6 +643,8 @@ pub const BeamNode = struct {
612643
switch (sync_status) {
613644
.behind_peers => |info| {
614645
// Only sync from this peer if their finalized slot is ahead of ours
646+
// Note: Fork divergence is already detected by getSyncStatus(), which reports it when
647+
// our_finalized_slot <= peer.finalized_slot <= our_head_slot AND we don't have their finalized block.
615648
if (status_resp.finalized_slot > self.chain.forkChoice.fcStore.latest_finalized.slot) {
616649
self.logger.info("peer {s}{any} is ahead (peer_finalized_slot={d} > our_head_slot={d}), initiating sync by requesting head block 0x{x}", .{
617650
status_ctx.peer_id,
@@ -630,6 +663,15 @@ pub const BeamNode = struct {
630663
};
631664
}
632665
},
666+
.fork_diverged => |diverge_info| {
667+
self.logger.err("FORK DIVERGENCE DETECTED: our finalized=0x{x} at slot {d}, peer finalized=0x{x} at slot {d}. Checkpoint sync required to recover.", .{
668+
&diverge_info.our_finalized_root,
669+
diverge_info.our_finalized_slot,
670+
&diverge_info.peer_finalized_root,
671+
diverge_info.peer_finalized_slot,
672+
});
673+
// TODO: Trigger automatic checkpoint sync recovery here.
674+
},
633675
.synced, .no_peers => {},
634676
}
635677
},
@@ -909,6 +951,9 @@ pub const BeamNode = struct {
909951
// Sweep timed-out RPC requests to prevent sync stalls from non-responsive peers
910952
self.sweepTimedOutRequests();
911953

954+
// Stall detection: if head hasn't advanced while behind peers, we may be stuck
955+
self.checkSyncStall();
956+
912957
if (self.validator) |*validator| {
913958
// we also tick validator per interval in case it would
914959
// need to sync its future duties when its an independent validator
@@ -993,6 +1038,46 @@ pub const BeamNode = struct {
9931038
}
9941039
}
9951040

1041+
/// Detects sync stalls: when head hasn't advanced while we're behind peers
1042+
fn checkSyncStall(self: *Self) void {
1043+
const current_head_slot = self.chain.forkChoice.head.slot;
1044+
const cached_blocks = self.network.fetched_blocks.count();
1045+
1046+
// If head advanced, reset stall counter
1047+
if (current_head_slot > self.last_checked_head_slot) {
1048+
self.last_checked_head_slot = current_head_slot;
1049+
self.stall_interval_count = 0;
1050+
return;
1051+
}
1052+
1053+
// Check if we're behind peers
1054+
const sync_status = self.chain.getSyncStatus();
1055+
const is_behind = switch (sync_status) {
1056+
.behind_peers, .fork_diverged => true,
1057+
.synced, .no_peers => false,
1058+
};
1059+
1060+
if (!is_behind) {
1061+
self.stall_interval_count = 0;
1062+
return;
1063+
}
1064+
1065+
// Increment stall counter
1066+
self.stall_interval_count += 1;
1067+
1068+
// Log an error every 60 stalled intervals (~60 seconds with 1s intervals),
1069+
// reporting the cached block count (which indicates sync attempts); the count itself does not gate the log.
1070+
if (self.stall_interval_count > 0 and self.stall_interval_count % 60 == 0) {
1071+
self.logger.err("SYNC STALL DETECTED: head stuck at slot {d} for {d} intervals, cached_blocks={d}, forkchoice_nodes={d}. Consider checkpoint sync.", .{
1072+
current_head_slot,
1073+
self.stall_interval_count,
1074+
cached_blocks,
1075+
self.chain.forkChoice.getNodeCount(),
1076+
});
1077+
// TODO: After prolonged stall (e.g., 5+ minutes), automatically trigger checkpoint sync.
1078+
}
1079+
}
1080+
9961081
pub fn publishBlock(self: *Self, signed_block: types.SignedBlockWithAttestation) !void {
9971082
const block = signed_block.message.block;
9981083

pkgs/node/src/validator_client.zig

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,15 @@ pub const ValidatorClient = struct {
117117
});
118118
return null;
119119
},
120+
.fork_diverged => |info| {
121+
self.logger.err("skipping block production for slot={d} proposer={d}: FORK DIVERGED (our_finalized={d}, peer_finalized={d})", .{
122+
slot,
123+
slot_proposer_id,
124+
info.our_finalized_slot,
125+
info.peer_finalized_slot,
126+
});
127+
return null;
128+
},
120129
}
121130

122131
// 1. construct the block
@@ -184,6 +193,14 @@ pub const ValidatorClient = struct {
184193
});
185194
return null;
186195
},
196+
.fork_diverged => |info| {
197+
self.logger.err("skipping attestation production for slot={d}: FORK DIVERGED (our_finalized={d}, peer_finalized={d})", .{
198+
slot,
199+
info.our_finalized_slot,
200+
info.peer_finalized_slot,
201+
});
202+
return null;
203+
},
187204
}
188205

189206
const slot_proposer_id = self.getSlotProposer(slot);

0 commit comments

Comments
 (0)