# Block Database #4027

**Open**: wants to merge 28 commits into `master` from `dl/blockdb`

## Commits

- `260979a`: blockdb setup & readme (DracoLi, Jun 1, 2025)
- `64ca7f1`: feat: block db implementation & readme (DracoLi, Jun 9, 2025)
- `cf35473`: refactor: rename store to database (DracoLi, Jun 9, 2025)
- `15ae1d1`: feat: add tests and update blockdb to have separate methods to read h… (DracoLi, Jun 22, 2025)
- `c6989b0`: feat: data splitting & fix linting (DracoLi, Jun 23, 2025)
- `c1bcf97`: fix: close db before deleting the file (DracoLi, Jun 23, 2025)
- `4201549`: fix: recovery issues with data files splitting & feedback (DracoLi, Jun 26, 2025)
- `9a90669`: use lru for file cache and fix recovery issues (DracoLi, Jun 30, 2025)
- `decbfe8`: refactor: use t.TempDir (DracoLi, Jun 30, 2025)
- `f08b7a7`: fix: cache test (DracoLi, Jul 2, 2025)
- `cb900cf`: Merge branch 'master' into dl/blockdb (DracoLi, Jul 2, 2025)
- `dd98830`: refactor: move database methods to database.go (DracoLi, Jul 6, 2025)
- `e1aa481`: rename blockHeader -> blockEntryHeader and improve recovery logic (DracoLi, Jul 6, 2025)
- `4107837`: make MaxDataFiles configurable (DracoLi, Jul 6, 2025)
- `4d2822b`: add more logging (DracoLi, Jul 6, 2025)
- `ef3fbb0`: move data and index dir to config and rename config (DracoLi, Jul 6, 2025)
- `dcde3ee`: fix lint (DracoLi, Jul 7, 2025)
- `4bf1935`: fix struct alignment and add tests (DracoLi, Jul 7, 2025)
- `6392142`: fix: separate errors for directories (DracoLi, Jul 9, 2025)
- `9eb635a`: consistent block height tracking (DracoLi, Jul 9, 2025)
- `5f018b4`: remove truncate config (DracoLi, Jul 10, 2025)
- `b9b8fa8`: add additional tests (DracoLi, Jul 10, 2025)
- `80e01d2`: fix lint and improve test error msg (DracoLi, Jul 10, 2025)
- `4752af3`: remove assertion in go routine (DracoLi, Jul 10, 2025)
- `abce7c2`: Merge branch 'master' into dl/blockdb (DracoLi, Jul 10, 2025)
- `2530a83`: limit concurrent calls to persistIndexHeader (DracoLi, Jul 11, 2025)
- `c20a237`: add warning log if config values differ from index header (DracoLi, Jul 11, 2025)
- `68f411f`: change warn logs to info (DracoLi, Jul 11, 2025)

### `cache/lru/cache.go` (27 additions, 2 deletions)

```diff
@@ -20,6 +20,17 @@ type Cache[K comparable, V any] struct {
 	lock     sync.Mutex
 	elements *linked.Hashmap[K, V]
 	size     int
+
+	// onEvict is called with the key and value of an evicted entry, if set.
+	onEvict func(K, V)
 }
 
+// SetOnEvict sets a callback to be called with the key and value of an evicted entry.
+// The callback is called synchronously while holding the cache lock.
+func (c *Cache[K, V]) SetOnEvict(cb func(K, V)) {
+	c.lock.Lock()
+	defer c.lock.Unlock()
+	c.onEvict = cb
+}
+
 func NewCache[K comparable, V any](size int) *Cache[K, V] {
@@ -34,8 +45,11 @@ func (c *Cache[K, V]) Put(key K, value V) {
 	defer c.lock.Unlock()
 
 	if c.elements.Len() == c.size {
-		oldestKey, _, _ := c.elements.Oldest()
+		oldestKey, oldestValue, _ := c.elements.Oldest()
 		c.elements.Delete(oldestKey)
+		if c.onEvict != nil {
+			c.onEvict(oldestKey, oldestValue)
+		}
 	}
 	c.elements.Put(key, value)
 }
@@ -55,14 +69,25 @@ func (c *Cache[K, V]) Get(key K) (V, bool) {
 func (c *Cache[K, _]) Evict(key K) {
 	c.lock.Lock()
 	defer c.lock.Unlock()
+
+	value, _ := c.elements.Get(key)
 	c.elements.Delete(key)
+	if c.onEvict != nil {
+		c.onEvict(key, value)
+	}
 }
 
 func (c *Cache[_, _]) Flush() {
 	c.lock.Lock()
 	defer c.lock.Unlock()
+
+	// Call onEvict for each element before clearing
+	if c.onEvict != nil {
+		iter := c.elements.NewIterator()
+		for iter.Next() {
+			c.onEvict(iter.Key(), iter.Value())
+		}
+	}
+
 	c.elements.Clear()
 }
```
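
The new `SetOnEvict` hook lets a caller tie resource cleanup to LRU eviction, which is exactly what a bounded cache of open file handles needs. A usage sketch (not code from this PR, though the "use lru for file cache" commit suggests BlockDB uses the hook roughly this way to close evicted data-file handles):

```go
import (
	"os"

	"github.com/ava-labs/avalanchego/cache/lru"
)

// Keep at most 16 data files open; close handles as they fall out of the cache.
fileCache := lru.NewCache[int, *os.File](16)
fileCache.SetOnEvict(func(_ int, f *os.File) {
	// Evict may fire for a key with no live entry, so f can be nil.
	if f != nil {
		_ = f.Close()
	}
})
```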

### `cache/lru/cache_test.go` (16 additions)

```diff
@@ -6,6 +6,8 @@ package lru
 import (
 	"testing"
 
+	"github.com/stretchr/testify/require"
+
 	"github.com/ava-labs/avalanchego/cache/cachetest"
 	"github.com/ava-labs/avalanchego/ids"
 )
@@ -19,3 +21,17 @@ func TestCacheEviction(t *testing.T) {
 	c := NewCache[ids.ID, int64](2)
 	cachetest.Eviction(t, c)
 }
+
+func TestCacheFlushWithOnEvict(t *testing.T) {
+	c := NewCache[ids.ID, int64](2)
+
+	// Track which elements were evicted
+	evicted := make(map[ids.ID]int64)
+	c.SetOnEvict(func(key ids.ID, value int64) {
+		evicted[key] = value
+	})
+
+	cachetest.Eviction(t, c)
+	require.Zero(t, c.Len())
+	require.Len(t, evicted, 3)
+}
```

### `go.mod` (1 addition, 1 deletion)

```diff
@@ -17,6 +17,7 @@ require (
 	github.com/ava-labs/ledger-avalanche/go v0.0.0-20241009183145-e6f90a8a1a60
 	github.com/ava-labs/libevm v1.13.14-0.3.0.rc.1
 	github.com/btcsuite/btcd/btcutil v1.1.3
+	github.com/cespare/xxhash/v2 v2.3.0
 	github.com/cockroachdb/pebble v0.0.0-20230928194634-aa077af62593
 	github.com/compose-spec/compose-go v1.20.2
 	github.com/decred/dcrd/dcrec/secp256k1/v4 v4.1.0
@@ -89,7 +90,6 @@ require (
 	github.com/bits-and-blooms/bitset v1.10.0 // indirect
 	github.com/btcsuite/btcd/btcec/v2 v2.3.2 // indirect
 	github.com/cenkalti/backoff/v4 v4.2.1 // indirect
-	github.com/cespare/xxhash/v2 v2.3.0 // indirect
 	github.com/cockroachdb/errors v1.9.1 // indirect
 	github.com/cockroachdb/logtags v0.0.0-20230118201751-21c54148d20b // indirect
 	github.com/cockroachdb/redact v1.1.3 // indirect
```

### `x/blockdb/README.md` (new file, 196 additions)

# BlockDB

BlockDB is a specialized database optimized for blockchain blocks.

## Key Functionalities

- **O(1) Performance**: Both reads and writes complete in constant time
- **Parallel Operations**: Multiple threads can read and write blocks concurrently without blocking
- **Flexible Write Ordering**: Supports out-of-order block writes for bootstrapping
- **Configurable Durability**: Optional `syncToDisk` mode guarantees immediate recoverability
- **Automatic Recovery**: Detects and recovers unindexed blocks after unclean shutdowns

## Design

BlockDB uses a single index file and multiple data files. The index file maps block heights to locations in the data files, while data files store the actual block content. Data storage can be split across multiple data files based on the maximum data file size.

```
┌─────────────────┐ ┌─────────────────┐
│ Index File │ │ Data File 1 │
│ (.idx) │ │ (.dat) │
├─────────────────┤ ├─────────────────┤
│ Header │ │ Block 0 │
│ - Version │ ┌─────>│ - Header │
│ - Min Height │ │ │ - Data │
│ - Max Height │ │ ├─────────────────┤
│ - Data Size │ │ │ Block 1 │
│ - ... │ │ ┌──>│ - Header │
├─────────────────┤ │ │ │ - Data │
│ Entry[0] │ │ │ ├─────────────────┤
│ - Offset ───────┼──┘ │ │ ... │
│ - Size │ │ └─────────────────┘
│ - Header Size │ │
├─────────────────┤ │
│ Entry[1] │ │
│ - Offset ───────┼─────┘
│ - Size │
│ - Header Size │
├─────────────────┤
│ ... │
└─────────────────┘
```

### File Formats

#### Index File Structure

The index file consists of a fixed-size header followed by fixed-size entries:

```
Index File Header (64 bytes):
┌────────────────────────────────┬─────────┐
│ Field │ Size │
├────────────────────────────────┼─────────┤
│ Version │ 8 bytes │
│ Max Data File Size │ 8 bytes │
│ Min Block Height │ 8 bytes │
│ Max Contiguous Height │ 8 bytes │
│ Max Block Height │ 8 bytes │
│ Next Write Offset │ 8 bytes │
│ Reserved │ 16 bytes│
└────────────────────────────────┴─────────┘

Index Entry (16 bytes):
┌────────────────────────────────┬─────────┐
│ Field │ Size │
├────────────────────────────────┼─────────┤
│ Data File Offset │ 8 bytes │
│ Block Data Size │ 4 bytes │
│ Header Size │ 4 bytes │
└────────────────────────────────┴─────────┘
```
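
As a sketch, these fixed layouts map naturally onto fixed-size Go structs. The field names below are illustrative, derived only from the tables above, not the PR's actual identifiers:

```go
// Illustrative mirror of the 64-byte index file header.
type indexFileHeader struct {
	Version             uint64
	MaxDataFileSize     uint64
	MinBlockHeight      uint64
	MaxContiguousHeight uint64
	MaxBlockHeight      uint64
	NextWriteOffset     uint64
	Reserved            [16]byte // pads the header to 64 bytes
}

// Illustrative mirror of a 16-byte index entry.
type indexEntry struct {
	DataFileOffset uint64 // where the block's entry starts in its data file
	BlockDataSize  uint32 // size of the stored block data
	HeaderSize     uint32 // size of the block's header portion
}
```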

#### Data File Structure

Each block in the data file is stored with a block entry header followed by the raw block data:

```
Block Entry Header (26 bytes):
┌────────────────────────────────┬─────────┐
│ Field │ Size │
├────────────────────────────────┼─────────┤
│ Height │ 8 bytes │
│ Size │ 4 bytes │
│ Checksum │ 8 bytes │
│ Header Size │ 4 bytes │
│ Version │ 2 bytes │
└────────────────────────────────┴─────────┘
```
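
For illustration, serializing and checksumming this header could look like the following. The field order follows the table above; little-endian byte order is an assumption, and xxhash64 for the checksum is an inference from this PR promoting `github.com/cespare/xxhash/v2` to a direct dependency:

```go
import (
	"encoding/binary"

	"github.com/cespare/xxhash/v2"
)

// blockEntryHeader mirrors the 26-byte layout above (names illustrative).
type blockEntryHeader struct {
	Height     uint64
	Size       uint32
	Checksum   uint64 // assumed: xxhash64 over the raw block data
	HeaderSize uint32
	Version    uint16
}

// marshal lays the fields out in table order; byte order is assumed.
func (h blockEntryHeader) marshal() []byte {
	buf := make([]byte, 26)
	binary.LittleEndian.PutUint64(buf[0:8], h.Height)
	binary.LittleEndian.PutUint32(buf[8:12], h.Size)
	binary.LittleEndian.PutUint64(buf[12:20], h.Checksum)
	binary.LittleEndian.PutUint32(buf[20:24], h.HeaderSize)
	binary.LittleEndian.PutUint16(buf[24:26], h.Version)
	return buf
}

func checksumOf(blockData []byte) uint64 {
	return xxhash.Sum64(blockData)
}
```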

### Block Overwrites

BlockDB allows overwriting blocks at existing heights. When a block is overwritten, the new block is appended to the data file and the index entry is updated to point to the new location, leaving the old block data as unreferenced "dead" space. However, since blocks are immutable and rarely overwritten (e.g., during reorgs), this trade-off should have minimal impact in practice.

### Fixed-Size Index Entries

Each index entry is exactly 16 bytes on disk, containing the offset, size, and header size. This fixed size enables direct calculation of where each block's index entry is located, providing O(1) lookups. The index stays compact even for chains with very high block heights: at height 1 billion, the index file is only about 16 GB.
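
The lookup arithmetic is a single multiply-and-add. A sketch (the constants come from the layouts above; treating entry positions as relative to the minimum block height recorded in the header is an assumption):

```go
const (
	indexFileHeaderSize = 64 // fixed-size index file header
	indexEntrySize      = 16 // fixed-size entry per block height
)

// indexEntryOffset returns the byte offset of a height's entry in the
// index file, which is why lookups are O(1).
func indexEntryOffset(height, minHeight uint64) uint64 {
	return indexFileHeaderSize + (height-minHeight)*indexEntrySize
}
```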

### Durability and Fsync Behavior

BlockDB provides configurable durability through the `syncToDisk` parameter:

**Data File Behavior:**

- **When `syncToDisk=true`**: The data file is fsync'd after every block write, guaranteeing durability against both process failures and kernel/machine failures.
- **When `syncToDisk=false`**: Data file writes are buffered, providing durability against process failures but not against kernel or machine failures.

**Index File Behavior:**

- **When `syncToDisk=true`**: The index file is fsync'd every `CheckpointInterval` blocks (when the header is written).
- **When `syncToDisk=false`**: The index file relies on OS buffering and is not explicitly fsync'd.
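
In code terms, the two modes differ only in whether each data-file append is followed by an fsync. A simplified sketch, not the PR's actual write path:

```go
// writeAndMaybeSync appends an encoded block to the data file, fsyncing
// when syncToDisk is set.
func writeAndMaybeSync(f *os.File, encoded []byte, syncToDisk bool) error {
	if _, err := f.Write(encoded); err != nil {
		return err
	}
	if syncToDisk {
		return f.Sync() // fsync: durable even across kernel/machine failure
	}
	// Without fsync the bytes sit in the OS page cache: they survive a
	// process crash but not a kernel panic or power loss.
	return nil
}
```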

### Recovery Mechanism

On startup, BlockDB checks for signs of an unclean shutdown by comparing the data file size on disk with the indexed data size stored in the index file header. If the data files are larger than what the index claims, it indicates that blocks were written but the index wasn't properly updated before shutdown.

**Recovery Process** (see the sketch after this list):

1. Starts scanning from where the index left off (`NextWriteOffset`)
2. For each unindexed block found:
- Validates the block entry header and checksum
- Writes the corresponding index entry
3. Calculates the max contiguous height and max block height
4. Updates the index header with the new max contiguous height, max block height, and next write offset
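
A minimal sketch of that scan, reusing the 26-byte block entry header layout above. Little-endian encoding, xxhash64 checksums, and stopping at the first invalid entry are all assumptions, and `addIndexEntry` is a hypothetical callback:

```go
// recoverUnindexed scans data written past the indexed region, re-indexes
// each valid block, and returns the new next-write offset.
func recoverUnindexed(data []byte, nextWriteOffset uint64, addIndexEntry func(height, offset uint64)) uint64 {
	const entryHeaderSize = 26
	off := nextWriteOffset
	for off+entryHeaderSize <= uint64(len(data)) {
		height := binary.LittleEndian.Uint64(data[off : off+8])
		size := binary.LittleEndian.Uint32(data[off+8 : off+12])
		sum := binary.LittleEndian.Uint64(data[off+12 : off+20])
		end := off + entryHeaderSize + uint64(size)
		if end > uint64(len(data)) {
			break // truncated tail: a partially written block
		}
		if xxhash.Sum64(data[off+entryHeaderSize:end]) != sum {
			break // checksum mismatch: stop before corrupt data
		}
		addIndexEntry(height, off)
		off = end
	}
	return off
}
```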

## Usage

### Creating a Database

```go
import (
	"errors"
	"fmt"

	"github.com/ava-labs/avalanchego/utils/logging"
	"github.com/ava-labs/avalanchego/x/blockdb"
)

config := blockdb.DefaultConfig().
	WithDir("/path/to/blockdb")
db, err := blockdb.New(config, logging.NoLog{})
if err != nil {
	fmt.Println("Error creating database:", err)
	return
}
defer db.Close()
```

### Writing and Reading Blocks

```go
// Write a block with header size
height := uint64(100)
blockData := []byte("header:block data")
headerSize := uint32(7) // First 7 bytes are the header
err := db.WriteBlock(height, blockData, headerSize)
if err != nil {
	fmt.Println("Error writing block:", err)
	return
}

// Read a block
blockData, err = db.ReadBlock(height)
if err != nil {
	if errors.Is(err, blockdb.ErrBlockNotFound) {
		fmt.Println("Block doesn't exist at this height")
		return
	}
	fmt.Println("Error reading block:", err)
	return
}

// Read block components separately
headerData, err := db.ReadHeader(height)
if err != nil {
	if errors.Is(err, blockdb.ErrBlockNotFound) {
		fmt.Println("Block doesn't exist at this height")
		return
	}
	fmt.Println("Error reading header:", err)
	return
}
bodyData, err := db.ReadBody(height)
if err != nil {
	if errors.Is(err, blockdb.ErrBlockNotFound) {
		fmt.Println("Block doesn't exist at this height")
		return
	}
	fmt.Println("Error reading body:", err)
	return
}
```

## TODO

- Implement a block cache for recently accessed blocks
- Use a buffered pool to avoid allocations on reads and writes
- Add metrics
- Add performance benchmarks
- Consider supporting missing data files (currently we error if any data files are missing)