# Consolidate reqresp_pre_import_cache into data_availability_checker #8045
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits: f2d5e92, 5f2e85c, 092c899, ab49afa
```diff
@@ -340,10 +340,6 @@ pub enum BlockProcessStatus<E: EthSpec> {
     ExecutionValidated(Arc<SignedBeaconBlock<E>>),
 }
 
-pub struct BeaconChainMetrics {
-    pub reqresp_pre_import_cache_len: usize,
-}
-
 pub type LightClientProducerEvent<T> = (Hash256, Slot, SyncAggregate<T>);
 
 pub type BeaconForkChoice<T> = ForkChoice<
```
```diff
@@ -363,9 +359,6 @@ pub type BeaconStore<T> = Arc<
     >,
 >;
 
-/// Cache gossip verified blocks to serve over ReqResp before they are imported
-type ReqRespPreImportCache<E> = HashMap<Hash256, Arc<SignedBeaconBlock<E>>>;
-
 /// Represents the "Beacon Chain" component of Ethereum 2.0. Allows import of blocks and block
 /// operations and chooses a canonical head.
 pub struct BeaconChain<T: BeaconChainTypes> {
```
```diff
@@ -462,8 +455,6 @@ pub struct BeaconChain<T: BeaconChainTypes> {
     pub(crate) attester_cache: Arc<AttesterCache>,
     /// A cache used when producing attestations whilst the head block is still being imported.
     pub early_attester_cache: EarlyAttesterCache<T::EthSpec>,
-    /// Cache gossip verified blocks to serve over ReqResp before they are imported
-    pub reqresp_pre_import_cache: Arc<RwLock<ReqRespPreImportCache<T::EthSpec>>>,
    /// A cache used to keep track of various block timings.
     pub block_times_cache: Arc<RwLock<BlockTimesCache>>,
     /// A cache used to track pre-finalization block roots for quick rejection.
```
```diff
@@ -1289,18 +1280,8 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
     /// chain. Used by sync to learn the status of a block and prevent repeated downloads /
     /// processing attempts.
     pub fn get_block_process_status(&self, block_root: &Hash256) -> BlockProcessStatus<T::EthSpec> {
-        if let Some(block) = self
-            .data_availability_checker
-            .get_execution_valid_block(block_root)
-        {
-            return BlockProcessStatus::ExecutionValidated(block);
-        }
-
-        if let Some(block) = self.reqresp_pre_import_cache.read().get(block_root) {
-            // A block is on the `reqresp_pre_import_cache` but NOT in the
-            // `data_availability_checker` only if it is actively processing. We can expect a future
-            // event with the result of processing
-            return BlockProcessStatus::NotValidated(block.clone());
+        if let Some(cached_block) = self.data_availability_checker.get_cached_block(block_root) {
+            return cached_block;
         }
 
         BlockProcessStatus::Unknown
```
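For context on the change above: the old code needed two lookups, one for execution-valid blocks in the DA checker and one for the `reqresp_pre_import_cache`, while the new `get_cached_block` answers both in a single call. A minimal sketch of the expected semantics against a deliberately simplified cache (`SimplifiedDaChecker`, `CachedBlock`, and the `u64` root key are illustrative stand-ins, not Lighthouse's actual internals):

```rust
use std::collections::HashMap;
use std::sync::Arc;

/// Mirrors the `BlockProcessStatus` enum from the diff, with the generics simplified.
enum BlockProcessStatus<B> {
    /// The block is in no cache; sync may need to download and process it.
    Unknown,
    /// The block is cached but still being processed; a result event will follow.
    NotValidated(Arc<B>),
    /// The block has passed execution-layer verification.
    ExecutionValidated(Arc<B>),
}

struct CachedBlock<B> {
    block: Arc<B>,
    execution_valid: bool,
}

/// Stand-in for the DA checker's block cache (`u64` in place of `Hash256`).
struct SimplifiedDaChecker<B> {
    blocks: HashMap<u64, CachedBlock<B>>,
}

impl<B> SimplifiedDaChecker<B> {
    /// One lookup subsuming both of the old checks: a cached block is either
    /// already execution validated or still mid-processing.
    fn get_cached_block(&self, root: &u64) -> Option<BlockProcessStatus<B>> {
        self.blocks.get(root).map(|cached| {
            if cached.execution_valid {
                BlockProcessStatus::ExecutionValidated(cached.block.clone())
            } else {
                BlockProcessStatus::NotValidated(cached.block.clone())
            }
        })
    }
}
```

On `None`, the caller falls through to `BlockProcessStatus::Unknown`, exactly as in the diff.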
```diff
@@ -3054,8 +3035,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
 
         self.emit_sse_blob_sidecar_events(&block_root, std::iter::once(blob.as_blob()));
 
-        let r = self.check_gossip_blob_availability_and_import(blob).await;
-        self.remove_notified(&block_root, r)
+        self.check_gossip_blob_availability_and_import(blob).await
     }
 
     /// Cache the data columns in the processing cache, process it, then evict it from the cache if it was
```
```diff
@@ -3092,15 +3072,13 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
             data_columns.iter().map(|column| column.as_data_column()),
         );
 
-        let r = self
-            .check_gossip_data_columns_availability_and_import(
-                slot,
-                block_root,
-                data_columns,
-                publish_fn,
-            )
-            .await;
-        self.remove_notified(&block_root, r)
+        self.check_gossip_data_columns_availability_and_import(
+            slot,
+            block_root,
+            data_columns,
+            publish_fn,
+        )
+        .await
     }
 
     /// Cache the blobs in the processing cache, process it, then evict it from the cache if it was
```
```diff
@@ -3139,10 +3117,8 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
 
         self.emit_sse_blob_sidecar_events(&block_root, blobs.iter().flatten().map(Arc::as_ref));
 
-        let r = self
-            .check_rpc_blob_availability_and_import(slot, block_root, blobs)
-            .await;
-        self.remove_notified(&block_root, r)
+        self.check_rpc_blob_availability_and_import(slot, block_root, blobs)
+            .await
     }
 
     /// Process blobs retrieved from the EL and returns the `AvailabilityProcessingStatus`.
```
```diff
@@ -3174,10 +3150,8 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
             }
         }
 
-        let r = self
-            .check_engine_blobs_availability_and_import(slot, block_root, engine_get_blobs_output)
-            .await;
-        self.remove_notified(&block_root, r)
+        self.check_engine_blobs_availability_and_import(slot, block_root, engine_get_blobs_output)
+            .await
     }
 
     fn emit_sse_blob_sidecar_events<'a, I>(self: &Arc<Self>, block_root: &Hash256, blobs_iter: I)
```
```diff
@@ -3270,10 +3244,8 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
             custody_columns.iter().map(|column| column.as_ref()),
         );
 
-        let r = self
-            .check_rpc_custody_columns_availability_and_import(slot, block_root, custody_columns)
-            .await;
-        self.remove_notified(&block_root, r)
+        self.check_rpc_custody_columns_availability_and_import(slot, block_root, custody_columns)
+            .await
     }
 
     pub async fn reconstruct_data_columns(
```
```diff
@@ -3320,10 +3292,8 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
             return Ok(None);
         };
 
-        let r = self
-            .process_availability(slot, availability, || Ok(()))
-            .await;
-        self.remove_notified(&block_root, r)
+        self.process_availability(slot, availability, || Ok(()))
+            .await
             .map(|availability_processing_status| {
                 Some((availability_processing_status, data_columns_to_publish))
             })
```
```diff
@@ -3340,46 +3310,6 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
         }
     }
 
-    /// Remove any block components from the *processing cache* if we no longer require them. If the
-    /// block was imported full or erred, we no longer require them.
-    fn remove_notified(
-        &self,
-        block_root: &Hash256,
-        r: Result<AvailabilityProcessingStatus, BlockError>,
-    ) -> Result<AvailabilityProcessingStatus, BlockError> {
-        let has_missing_components =
-            matches!(r, Ok(AvailabilityProcessingStatus::MissingComponents(_, _)));
-        if !has_missing_components {
-            self.reqresp_pre_import_cache.write().remove(block_root);
-        }
-        r
-    }
-
-    /// Wraps `process_block` in logic to cache the block's commitments in the processing cache
-    /// and evict if the block was imported or errored.
-    pub async fn process_block_with_early_caching<B: IntoExecutionPendingBlock<T>>(
-        self: &Arc<Self>,
-        block_root: Hash256,
-        unverified_block: B,
-        block_source: BlockImportSource,
-        notify_execution_layer: NotifyExecutionLayer,
-    ) -> Result<AvailabilityProcessingStatus, BlockError> {
-        self.reqresp_pre_import_cache
-            .write()
-            .insert(block_root, unverified_block.block_cloned());
-
-        let r = self
-            .process_block(
-                block_root,
-                unverified_block,
-                notify_execution_layer,
-                block_source,
-                || Ok(()),
-            )
-            .await;
-        self.remove_notified(&block_root, r)
-    }
-
     /// Check for known and configured invalid block roots before processing.
     pub fn check_invalid_block_roots(&self, block_root: Hash256) -> Result<(), BlockError> {
         if self.config.invalid_block_roots.contains(&block_root) {
```
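The deleted wrapper captures the old lifecycle in full: insert the block into `reqresp_pre_import_cache` before processing, then have `remove_notified` prune it unless components are still missing. For contrast with the new flow, where the DA checker receives the block once and owns it until eviction, here is the old control flow reduced to a skeleton (`Status`, `process_block_stub`, and the `u64`/`Vec<u8>` types are illustrative stand-ins, not the real signatures):

```rust
use std::collections::HashMap;

// Stand-in for the AvailabilityProcessingStatus / BlockError outcomes.
enum Status {
    Imported,
    MissingComponents,
    Errored,
}

// Old flow (removed in this PR): cache early so ReqResp can serve the block,
// process, then prune the cache unless components are still missing.
fn process_block_with_early_caching_old(
    cache: &mut HashMap<u64, Vec<u8>>,
    block_root: u64,
    block: Vec<u8>,
) -> Status {
    cache.insert(block_root, block.clone()); // early caching for ReqResp
    let result = process_block_stub(block); // the actual block processing
    if !matches!(result, Status::MissingComponents) {
        cache.remove(&block_root); // remove_notified
    }
    result
}

fn process_block_stub(_block: Vec<u8>) -> Status {
    Status::Imported // stand-in
}
```

In the new flow this bookkeeping disappears from `BeaconChain`: `process_block` calls `put_pre_execution_block` once (see the hunk at 3431 below) and the DA checker handles eviction itself.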
```diff
@@ -3411,12 +3341,6 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
         block_source: BlockImportSource,
         publish_fn: impl FnOnce() -> Result<(), BlockError>,
     ) -> Result<AvailabilityProcessingStatus, BlockError> {
-        // Start the Prometheus timer.
-        let _full_timer = metrics::start_timer(&metrics::BLOCK_PROCESSING_TIMES);
-
-        // Increment the Prometheus counter for block processing requests.
-        metrics::inc_counter(&metrics::BLOCK_PROCESSING_REQUESTS);
-
         let block_slot = unverified_block.block().slot();
 
         // Set observed time if not already set. Usually this should be set by gossip or RPC,
```
```diff
@@ -3431,6 +3355,15 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
             );
         }
 
+        self.data_availability_checker
+            .put_pre_execution_block(block_root, unverified_block.block_cloned())?;
```
**Collaborator:** Noting that the block is unverified, so it can be 10 MB. The current max size of the cache is 32 blocks, so there's no risk of OOM.
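(Taking those figures at face value: 32 blocks × 10 MB puts a hard upper bound of roughly 320 MB on the cache.)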
```diff
+
+        // Start the Prometheus timer.
+        let _full_timer = metrics::start_timer(&metrics::BLOCK_PROCESSING_TIMES);
+
+        // Increment the Prometheus counter for block processing requests.
+        metrics::inc_counter(&metrics::BLOCK_PROCESSING_REQUESTS);
+
         // A small closure to group the verification and import errors.
         let chain = self.clone();
         let import_block = async move {
```
```diff
@@ -3448,7 +3381,18 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
                     .set_time_consensus_verified(block_root, block_slot, timestamp)
             }
 
-            let executed_block = chain.into_executed_block(execution_pending).await?;
+            let executed_block = chain
+                .into_executed_block(execution_pending)
+                .await
+                .inspect_err(|_| {
+                    // If the block fails execution for whatever reason (e.g. engine offline),
+                    // and we keep it in the cache, then the node will NOT perform lookup and
+                    // reprocess this block until the block is evicted from DA checker, causing the
+                    // chain to get stuck temporarily if the block is canonical. Therefore we remove
+                    // it from the cache if execution fails.
+                    self.data_availability_checker
+                        .remove_block_on_execution_error(&block_root);
```
**Member:** I agree it's safer to just chuck it out of the cache and let lookup fetch the block regardless of the type of the execution error.
```diff
+                })?;
 
             // Record the *additional* time it took to wait for execution layer verification.
             if let Some(timestamp) = self.slot_clock.now_duration() {
```
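The eviction hook used in that `inspect_err` lives in the DA checker, whose side of the change is not shown on this page. A minimal sketch of what it plausibly does, assuming a root-keyed cache behind a lock (the type and field here are assumptions for illustration only):

```rust
use std::collections::HashMap;
use std::sync::RwLock;

/// Stand-in for the DA checker (`u64` for `Hash256`, `Vec<u8>` for a block).
struct DaCheckerSketch {
    blocks: RwLock<HashMap<u64, Vec<u8>>>,
}

impl DaCheckerSketch {
    /// Evict a block whose execution failed so that sync lookup stops seeing it
    /// as cached and will re-download and re-process it.
    fn remove_block_on_execution_error(&self, block_root: &u64) {
        self.blocks.write().unwrap().remove(block_root);
    }
}
```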
```diff
@@ -3574,9 +3518,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
         block: AvailabilityPendingExecutedBlock<T::EthSpec>,
     ) -> Result<AvailabilityProcessingStatus, BlockError> {
         let slot = block.block.slot();
-        let availability = self
-            .data_availability_checker
-            .put_pending_executed_block(block)?;
+        let availability = self.data_availability_checker.put_executed_block(block)?;
         self.process_availability(slot, availability, || Ok(()))
             .await
     }
```
```diff
@@ -7156,12 +7098,6 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
         )
     }
 
-    pub fn metrics(&self) -> BeaconChainMetrics {
-        BeaconChainMetrics {
-            reqresp_pre_import_cache_len: self.reqresp_pre_import_cache.read().len(),
-        }
-    }
-
     pub(crate) fn get_blobs_or_columns_store_op(
         &self,
         block_root: Hash256,
```
I still need to handle the case where block import errored, and remove those blocks from the cache.
The damage of leaving the invalid block in the cache is that we might serve the block over RPC after knowing it's invalid. Consider the case where we remove the block from the da_checker after noticing it's invalid, and then someone triggers lookup sync and we fetch it again: we would serve the block over RPC while verification is happening for the second time.

I don't see that big of an issue with leaving the invalid block in the da_checker until LRU evicts it. Otherwise, we can leave it there but mark it as "INVALID".
Thinking out loud:

- If the block is invalid we will downscore the peer.
- If we leave the invalid block in the cache
- If we remove the invalid block from the cache

Yes, agree. I actually think leaving it in the cache to prevent the lookup is better. A block that does not transition into an executed block will never become available, so I believe we don't really need to mark them.
Thanks for raising these good points. I'm now inclined to leave it as it is, since removing it doesn't seem to help us much, and the benefit wouldn't justify the extra code and complexity.
Adding more thoughts after seeing the failed `block_in_processing_cache_becomes_invalid` test. If the block fails execution initially for whatever reason (e.g. engine offline) and we keep it in the cache, then there's a chance that the chain might get stuck, because we will never perform lookup until the block is evicted from the cache. I think this is not great; I'll add code to remove it so we re-trigger a new lookup in this case. If it is indeed an invalid block, the peer will get penalised for getting us to do the lookup.
Removed the block that failed execution in 5f2e85c.
Just to be clear, is this because we cannot re-trigger the execution of a block sitting in the DA checker? Not fully clear on this.
Yes, exactly. Lookup will not re-download the block if it's in the DA checker; see `lighthouse/beacon_node/network/src/sync/network_context.rs`, lines 835 to 853 at ee734d1. So if it fails the first time because of an unexpected error, e.g. an EL timeout, it doesn't get re-downloaded until we fall back to range sync.
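To illustrate the guard being referenced (a paraphrase, not the actual `network_context.rs` code; `should_request_block` is a made-up name):

```rust
/// Simplified mirror of `BlockProcessStatus`, payloads omitted.
enum BlockProcessStatus {
    Unknown,
    NotValidated,
    ExecutionValidated,
}

/// Lookup only downloads the block when the chain knows nothing about it.
/// A block parked in the DA checker after a failed execution attempt reports
/// `NotValidated`, so it would never be re-requested; hence this PR evicts
/// such blocks on execution error.
fn should_request_block(status: &BlockProcessStatus) -> bool {
    matches!(status, BlockProcessStatus::Unknown)
}
```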