@@ -14,7 +14,7 @@ use std::{
14
14
} ;
15
15
16
16
use linera_base:: ensure;
17
- use rocksdb:: { BlockBasedOptions , Cache , DBCompactionStyle } ;
17
+ use rocksdb:: { BlockBasedOptions , Cache , DBCompactionStyle , SliceTransform } ;
18
18
use serde:: { Deserialize , Serialize } ;
19
19
use sysinfo:: { CpuRefreshKind , MemoryRefreshKind , RefreshKind , System } ;
20
20
use tempfile:: TempDir ;
@@ -171,20 +171,39 @@ impl RocksDbStoreExecutor {
171
171
key_prefix : Vec < u8 > ,
172
172
) -> Result < Vec < Vec < u8 > > , RocksDbStoreInternalError > {
173
173
check_key_size ( & key_prefix) ?;
174
+
174
175
let mut prefix = self . start_key . clone ( ) ;
175
176
prefix. extend ( key_prefix) ;
176
177
let len = prefix. len ( ) ;
177
- let mut iter = self . db . raw_iterator ( ) ;
178
+
179
+ // Configure ReadOptions optimized for SSDs and iterator performance
180
+ let mut read_opts = rocksdb:: ReadOptions :: default ( ) ;
181
+ // Let RocksDB prefetch data asynchronously while the iterator reads sequentially
182
+ read_opts. set_async_io ( true ) ;
183
+
184
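+ // Bound the scan at the prefix with its last byte incremented; if the prefix is empty or ends in 0xFF no bound is set and the starts_with check in the loop ends the scan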
185
+ let mut upper_bound = prefix. clone ( ) ;
186
+ if let Some ( last_byte) = upper_bound. last_mut ( ) {
187
+ if * last_byte < 255 {
188
+ * last_byte += 1 ;
189
+ read_opts. set_iterate_upper_bound ( upper_bound) ;
190
+ }
191
+ }
192
+
193
+ let mut iter = self . db . raw_iterator_opt ( read_opts) ;
178
194
let mut keys = Vec :: new ( ) ;
195
+
179
196
iter. seek ( & prefix) ;
180
- let mut next_key = iter. key ( ) ;
181
- while let Some ( key) = next_key {
182
- if !key. starts_with ( & prefix) {
197
+ while iter. valid ( ) {
198
+ if let Some ( key) = iter. key ( ) {
199
+ if !key. starts_with ( & prefix) {
200
+ break ;
201
+ }
202
+ keys. push ( key[ len..] . to_vec ( ) ) ;
203
+ } else {
183
204
break ;
184
205
}
185
- keys. push ( key[ len..] . to_vec ( ) ) ;
186
206
iter. next ( ) ;
187
- next_key = iter. key ( ) ;
188
207
}
189
208
Ok ( keys)
190
209
}
@@ -198,20 +217,36 @@ impl RocksDbStoreExecutor {
198
217
let mut prefix = self . start_key . clone ( ) ;
199
218
prefix. extend ( key_prefix) ;
200
219
let len = prefix. len ( ) ;
201
- let mut iter = self . db . raw_iterator ( ) ;
220
+
221
+ // Configure ReadOptions optimized for SSDs and iterator performance
222
+ let mut read_opts = rocksdb:: ReadOptions :: default ( ) ;
223
+ // Let RocksDB prefetch data asynchronously while the iterator reads sequentially
224
+ read_opts. set_async_io ( true ) ;
225
+
226
+ let mut upper_bound = prefix. clone ( ) ;
227
+ if let Some ( last_byte) = upper_bound. last_mut ( ) {
228
+ if * last_byte < 255 {
229
+ * last_byte += 1 ;
230
+ read_opts. set_iterate_upper_bound ( upper_bound) ;
231
+ }
232
+ }
233
+
234
+ let mut iter = self . db . raw_iterator_opt ( read_opts) ;
202
235
let mut key_values = Vec :: new ( ) ;
203
236
iter. seek ( & prefix) ;
204
- let mut next_key = iter. key ( ) ;
205
- while let Some ( key) = next_key {
206
- if !key. starts_with ( & prefix) {
237
+ while iter. valid ( ) {
238
+ if let Some ( key) = iter. key ( ) {
239
+ if !key. starts_with ( & prefix) {
240
+ break ;
241
+ }
242
+ if let Some ( value) = iter. value ( ) {
243
+ let key_value = ( key[ len..] . to_vec ( ) , value. to_vec ( ) ) ;
244
+ key_values. push ( key_value) ;
245
+ }
246
+ } else {
207
247
break ;
208
248
}
209
- if let Some ( value) = iter. value ( ) {
210
- let key_value = ( key[ len..] . to_vec ( ) , value. to_vec ( ) ) ;
211
- key_values. push ( key_value) ;
212
- }
213
249
iter. next ( ) ;
214
- next_key = iter. key ( ) ;
215
250
}
216
251
Ok ( key_values)
217
252
}
@@ -373,8 +408,32 @@ impl RocksDbStoreInternal {
373
408
total_ram / 4 ,
374
409
HYPER_CLOCK_CACHE_BLOCK_SIZE ,
375
410
) ) ;
411
+
412
+ // Configure bloom filters for prefix iteration optimization
413
+ block_options. set_bloom_filter ( 10.0 , false ) ;
414
+ block_options. set_whole_key_filtering ( false ) ;
415
+
416
+ // 32KB blocks instead of default 4KB - reduces iterator seeks
417
+ block_options. set_block_size ( 32 * 1024 ) ;
418
+ // Format version 5: filters use the faster, more accurate Bloom filter implementation (readable by RocksDB 6.6.0 and later)
419
+ block_options. set_format_version ( 5 ) ;
420
+
376
421
options. set_block_based_table_factory ( & block_options) ;
377
422
423
+ // Configure prefix extraction for bloom filter optimization
424
+ // Use 8 bytes: ROOT_KEY_DOMAIN (1 byte) + BCS variant (1-2 bytes) + identifier start (4-5 bytes)
425
+ let prefix_extractor = SliceTransform :: create_fixed_prefix ( 8 ) ;
426
+ options. set_prefix_extractor ( prefix_extractor) ;
427
+
428
+ // Size the memtable prefix bloom filter at 12.5% of the write buffer size
429
+ options. set_memtable_prefix_bloom_ratio ( 0.125 ) ;
430
+ // Optimize filters for lookups that mostly hit: do not build filters for the last LSM level (misses there get no filter help)
431
+ options. set_optimize_filters_for_hits ( true ) ;
432
+ // Use memory-mapped files for faster reads
433
+ options. set_allow_mmap_reads ( true ) ;
434
+ // Don't use random access pattern since we do prefix scans
435
+ options. set_advise_random_on_open ( false ) ;
436
+
378
437
let db = DB :: open ( & options, path_buf) ?;
379
438
let executor = RocksDbStoreExecutor {
380
439
db : Arc :: new ( db) ,
0 commit comments