Skip to content

Commit c18be16

Browse files
committed
Optimize RocksDB prefix searches
1 parent e2cad30 commit c18be16

File tree

1 file changed

+75
-16
lines changed

1 file changed

+75
-16
lines changed

linera-views/src/backends/rocks_db.rs

Lines changed: 75 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ use std::{
1414
};
1515

1616
use linera_base::ensure;
17-
use rocksdb::{BlockBasedOptions, Cache, DBCompactionStyle};
17+
use rocksdb::{BlockBasedOptions, Cache, DBCompactionStyle, SliceTransform};
1818
use serde::{Deserialize, Serialize};
1919
use sysinfo::{CpuRefreshKind, MemoryRefreshKind, RefreshKind, System};
2020
use tempfile::TempDir;
@@ -171,20 +171,39 @@ impl RocksDbStoreExecutor {
171171
key_prefix: Vec<u8>,
172172
) -> Result<Vec<Vec<u8>>, RocksDbStoreInternalError> {
173173
check_key_size(&key_prefix)?;
174+
174175
let mut prefix = self.start_key.clone();
175176
prefix.extend(key_prefix);
176177
let len = prefix.len();
177-
let mut iter = self.db.raw_iterator();
178+
179+
// Configure ReadOptions optimized for SSDs and iterator performance
180+
let mut read_opts = rocksdb::ReadOptions::default();
181+
// Enable async I/O so RocksDB can prefetch data asynchronously during sequential iteration
182+
read_opts.set_async_io(true);
183+
184+
// Bound the scan at prefix+1 when the last byte is < 255; otherwise the starts_with check below ends it
185+
let mut upper_bound = prefix.clone();
186+
if let Some(last_byte) = upper_bound.last_mut() {
187+
if *last_byte < 255 {
188+
*last_byte += 1;
189+
read_opts.set_iterate_upper_bound(upper_bound);
190+
}
191+
}
192+
193+
let mut iter = self.db.raw_iterator_opt(read_opts);
178194
let mut keys = Vec::new();
195+
179196
iter.seek(&prefix);
180-
let mut next_key = iter.key();
181-
while let Some(key) = next_key {
182-
if !key.starts_with(&prefix) {
197+
while iter.valid() {
198+
if let Some(key) = iter.key() {
199+
if !key.starts_with(&prefix) {
200+
break;
201+
}
202+
keys.push(key[len..].to_vec());
203+
} else {
183204
break;
184205
}
185-
keys.push(key[len..].to_vec());
186206
iter.next();
187-
next_key = iter.key();
188207
}
189208
Ok(keys)
190209
}
@@ -198,20 +217,36 @@ impl RocksDbStoreExecutor {
198217
let mut prefix = self.start_key.clone();
199218
prefix.extend(key_prefix);
200219
let len = prefix.len();
201-
let mut iter = self.db.raw_iterator();
220+
221+
// Configure ReadOptions optimized for SSDs and iterator performance
222+
let mut read_opts = rocksdb::ReadOptions::default();
223+
// Enable async I/O so RocksDB can prefetch data asynchronously during sequential iteration
224+
read_opts.set_async_io(true);
225+
226+
let mut upper_bound = prefix.clone();
227+
if let Some(last_byte) = upper_bound.last_mut() {
228+
if *last_byte < 255 {
229+
*last_byte += 1;
230+
read_opts.set_iterate_upper_bound(upper_bound);
231+
}
232+
}
233+
234+
let mut iter = self.db.raw_iterator_opt(read_opts);
202235
let mut key_values = Vec::new();
203236
iter.seek(&prefix);
204-
let mut next_key = iter.key();
205-
while let Some(key) = next_key {
206-
if !key.starts_with(&prefix) {
237+
while iter.valid() {
238+
if let Some(key) = iter.key() {
239+
if !key.starts_with(&prefix) {
240+
break;
241+
}
242+
if let Some(value) = iter.value() {
243+
let key_value = (key[len..].to_vec(), value.to_vec());
244+
key_values.push(key_value);
245+
}
246+
} else {
207247
break;
208248
}
209-
if let Some(value) = iter.value() {
210-
let key_value = (key[len..].to_vec(), value.to_vec());
211-
key_values.push(key_value);
212-
}
213249
iter.next();
214-
next_key = iter.key();
215250
}
216251
Ok(key_values)
217252
}
@@ -373,8 +408,32 @@ impl RocksDbStoreInternal {
373408
total_ram / 4,
374409
HYPER_CLOCK_CACHE_BLOCK_SIZE,
375410
));
411+
412+
// Configure bloom filters for prefix iteration optimization
413+
block_options.set_bloom_filter(10.0, false);
414+
block_options.set_whole_key_filtering(false);
415+
416+
// 32KB blocks instead of default 4KB - reduces iterator seeks
417+
block_options.set_block_size(32 * 1024);
418+
// Use latest format for better compression and performance
419+
block_options.set_format_version(5);
420+
376421
options.set_block_based_table_factory(&block_options);
377422

423+
// Configure prefix extraction for bloom filter optimization
424+
// Use 8 bytes: ROOT_KEY_DOMAIN (1 byte) + BCS variant (1-2 bytes) + identifier start (4-5 bytes)
425+
let prefix_extractor = SliceTransform::create_fixed_prefix(8);
426+
options.set_prefix_extractor(prefix_extractor);
427+
428+
// 12.5% of memtable size for bloom filter
429+
options.set_memtable_prefix_bloom_ratio(0.125);
430+
// Skip bloom filter for memtable when key exists
431+
options.set_optimize_filters_for_hits(true);
432+
// Use memory-mapped files for faster reads
433+
options.set_allow_mmap_reads(true);
434+
// Don't use random access pattern since we do prefix scans
435+
options.set_advise_random_on_open(false);
436+
378437
let db = DB::open(&options, path_buf)?;
379438
let executor = RocksDbStoreExecutor {
380439
db: Arc::new(db),

0 commit comments

Comments
 (0)