@@ -88,6 +88,10 @@ type FSM struct {
88
88
storeLatestState bool
89
89
90
90
chunker * raftchunking.ChunkingBatchingFSM
91
+
92
+ // testSnapshotRestoreError is used in tests to simulate an error while
93
+ // restoring a snapshot.
94
+ testSnapshotRestoreError bool
91
95
}
92
96
93
97
// NewFSM constructs a FSM using the given directory
@@ -193,20 +197,20 @@ func (f *FSM) witnessIndex(i *IndexValue) {
193
197
}
194
198
}
195
199
196
- func (f * FSM ) witnessSnapshot (index , term , configurationIndex uint64 , configuration raft.Configuration ) error {
200
+ func (f * FSM ) witnessSnapshot (metadata * raft.SnapshotMeta ) error {
197
201
var indexBytes []byte
198
202
latestIndex , _ := f .LatestState ()
199
203
200
- latestIndex .Index = index
201
- latestIndex .Term = term
204
+ latestIndex .Index = metadata . Index
205
+ latestIndex .Term = metadata . Term
202
206
203
207
var err error
204
208
indexBytes , err = proto .Marshal (latestIndex )
205
209
if err != nil {
206
210
return err
207
211
}
208
212
209
- protoConfig := raftConfigurationToProtoConfiguration (configurationIndex , configuration )
213
+ protoConfig := raftConfigurationToProtoConfiguration (metadata . ConfigurationIndex , metadata . Configuration )
210
214
configBytes , err := proto .Marshal (protoConfig )
211
215
if err != nil {
212
216
return err
@@ -232,16 +236,16 @@ func (f *FSM) witnessSnapshot(index, term, configurationIndex uint64, configurat
232
236
}
233
237
}
234
238
235
- atomic .StoreUint64 (f .latestIndex , index )
236
- atomic .StoreUint64 (f .latestTerm , term )
239
+ atomic .StoreUint64 (f .latestIndex , metadata . Index )
240
+ atomic .StoreUint64 (f .latestTerm , metadata . Term )
237
241
f .latestConfig .Store (protoConfig )
238
242
239
243
return nil
240
244
}
241
245
242
246
// Delete deletes the given key from the bolt file.
243
247
func (f * FSM ) Delete (ctx context.Context , path string ) error {
244
- defer metrics .MeasureSince ([]string {"raft " , "delete" }, time .Now ())
248
+ defer metrics .MeasureSince ([]string {"raft_storage" , "fsm " , "delete" }, time .Now ())
245
249
246
250
f .l .RLock ()
247
251
defer f .l .RUnlock ()
@@ -253,7 +257,7 @@ func (f *FSM) Delete(ctx context.Context, path string) error {
253
257
254
258
// DeletePrefix deletes the keys with the given prefix from the bolt file.
255
259
func (f * FSM ) DeletePrefix (ctx context.Context , prefix string ) error {
256
- defer metrics .MeasureSince ([]string {"raft " , "delete_prefix" }, time .Now ())
260
+ defer metrics .MeasureSince ([]string {"raft_storage" , "fsm " , "delete_prefix" }, time .Now ())
257
261
258
262
f .l .RLock ()
259
263
defer f .l .RUnlock ()
@@ -277,7 +281,9 @@ func (f *FSM) DeletePrefix(ctx context.Context, prefix string) error {
277
281
278
282
// Get retrieves the value at the given path from the bolt file.
279
283
func (f * FSM ) Get (ctx context.Context , path string ) (* physical.Entry , error ) {
284
+ // TODO: Remove this outdated metric name in a future release
280
285
defer metrics .MeasureSince ([]string {"raft" , "get" }, time .Now ())
286
+ defer metrics .MeasureSince ([]string {"raft_storage" , "fsm" , "get" }, time .Now ())
281
287
282
288
f .l .RLock ()
283
289
defer f .l .RUnlock ()
@@ -311,7 +317,7 @@ func (f *FSM) Get(ctx context.Context, path string) (*physical.Entry, error) {
311
317
312
318
// Put writes the given entry to the bolt file.
313
319
func (f * FSM ) Put (ctx context.Context , entry * physical.Entry ) error {
314
- defer metrics .MeasureSince ([]string {"raft " , "put" }, time .Now ())
320
+ defer metrics .MeasureSince ([]string {"raft_storage" , "fsm " , "put" }, time .Now ())
315
321
316
322
f .l .RLock ()
317
323
defer f .l .RUnlock ()
@@ -324,7 +330,9 @@ func (f *FSM) Put(ctx context.Context, entry *physical.Entry) error {
324
330
325
331
// List retrieves the set of keys with the given prefix from the bolt file.
326
332
func (f * FSM ) List (ctx context.Context , prefix string ) ([]string , error ) {
333
+ // TODO: Remove this outdated metric name in a future release
327
334
defer metrics .MeasureSince ([]string {"raft" , "list" }, time .Now ())
335
+ defer metrics .MeasureSince ([]string {"raft_storage" , "fsm" , "list" }, time .Now ())
328
336
329
337
f .l .RLock ()
330
338
defer f .l .RUnlock ()
@@ -531,6 +539,8 @@ type writeErrorCloser interface {
531
539
// (size, checksum, etc) and a second for the sink of the data. We also use a
532
540
// proto delimited writer so we can stream proto messages to the sink.
533
541
func (f * FSM ) writeTo (ctx context.Context , metaSink writeErrorCloser , sink writeErrorCloser ) {
542
+ defer metrics .MeasureSince ([]string {"raft_storage" , "fsm" , "write_snapshot" }, time .Now ())
543
+
534
544
protoWriter := protoio .NewDelimitedWriter (sink )
535
545
metadataProtoWriter := protoio .NewDelimitedWriter (metaSink )
536
546
@@ -573,7 +583,9 @@ func (f *FSM) writeTo(ctx context.Context, metaSink writeErrorCloser, sink write
573
583
574
584
// Snapshot implements the FSM interface. It returns a noop snapshot object.
575
585
func (f * FSM ) Snapshot () (raft.FSMSnapshot , error ) {
576
- return & noopSnapshotter {}, nil
586
+ return & noopSnapshotter {
587
+ fsm : f ,
588
+ }, nil
577
589
}
578
590
579
591
// SetNoopRestore is used to disable restore operations on raft startup. Because
@@ -589,48 +601,91 @@ func (f *FSM) SetNoopRestore(enabled bool) {
589
601
// first deletes the existing bucket to clear all existing data, then recreates
590
602
// it so we can copy in the snapshot.
591
603
func (f * FSM ) Restore (r io.ReadCloser ) error {
604
+ defer metrics .MeasureSince ([]string {"raft_storage" , "fsm" , "restore_snapshot" }, time .Now ())
605
+
592
606
if f .noopRestore == true {
593
607
return nil
594
608
}
595
609
610
+ snapMeta := r .(* boltSnapshotMetadataReader ).Metadata ()
611
+
596
612
protoReader := protoio .NewDelimitedReader (r , math .MaxInt32 )
597
613
defer protoReader .Close ()
598
614
599
615
f .l .Lock ()
600
616
defer f .l .Unlock ()
601
617
602
- // Start a write transaction.
618
+ // Delete the existing data bucket and create a new one.
619
+ f .logger .Debug ("snapshot restore: deleting bucket" )
603
620
err := f .db .Update (func (tx * bolt.Tx ) error {
604
621
err := tx .DeleteBucket (dataBucketName )
605
622
if err != nil {
606
623
return err
607
624
}
608
625
609
- b , err : = tx .CreateBucket (dataBucketName )
626
+ _ , err = tx .CreateBucket (dataBucketName )
610
627
if err != nil {
611
628
return err
612
629
}
613
630
614
- for {
631
+ return nil
632
+ })
633
+ if err != nil {
634
+ f .logger .Error ("could not restore snapshot: could not clear existing bucket" , "error" , err )
635
+ return err
636
+ }
637
+
638
+ // If we are testing a failed snapshot restore, return an error here.
639
+ if f .testSnapshotRestoreError {
640
+ return errors .New ("Test error" )
641
+ }
642
+
643
+ f .logger .Debug ("snapshot restore: deleting bucket done" )
644
+ f .logger .Debug ("snapshot restore: writing keys" )
645
+
646
+ var done bool
647
+ var keys int
648
+ for ! done {
649
+ err := f .db .Update (func (tx * bolt.Tx ) error {
650
+ b := tx .Bucket (dataBucketName )
615
651
s := new (pb.StorageEntry )
616
- err := protoReader .ReadMsg (s )
617
- if err != nil {
618
- if err == io .EOF {
619
- return nil
652
+
653
+ // Commit in batches of 50k. Bolt holds all the data in memory and
654
+ // doesn't split the pages until commit so we do incremental writes.
655
+ // This is safe since we have a write lock on the fsm's lock.
656
+ for i := 0 ; i < 50000 ; i ++ {
657
+ err := protoReader .ReadMsg (s )
658
+ if err != nil {
659
+ if err == io .EOF {
660
+ done = true
661
+ return nil
662
+ }
663
+ return err
620
664
}
621
- return err
622
- }
623
665
624
- err = b .Put ([]byte (s .Key ), s .Value )
625
- if err != nil {
626
- return err
666
+ err = b .Put ([]byte (s .Key ), s .Value )
667
+ if err != nil {
668
+ return err
669
+ }
670
+ keys += 1
627
671
}
672
+
673
+ return nil
674
+ })
675
+ if err != nil {
676
+ f .logger .Error ("could not restore snapshot" , "error" , err )
677
+ return err
628
678
}
629
679
630
- return nil
631
- })
632
- if err != nil {
633
- f .logger .Error ("could not restore snapshot" , "error" , err )
680
+ f .logger .Trace ("snapshot restore: writing keys" , "num_written" , keys )
681
+ }
682
+
683
+ f .logger .Debug ("snapshot restore: writing keys done" )
684
+
685
+ // Write the metadata after we have applied all the snapshot data
686
+ f .logger .Debug ("snapshot restore: writing metadata" )
687
+ if err := f .witnessSnapshot (snapMeta ); err != nil {
688
+ f .logger .Error ("could not write metadata" , "error" , err )
634
689
return err
635
690
}
636
691
@@ -639,10 +694,23 @@ func (f *FSM) Restore(r io.ReadCloser) error {
639
694
640
695
// noopSnapshotter implements the fsm.Snapshot interface. It persists no state data
641
696
// since our SnapshotStore reads data out of the FSM on Open().
642
- type noopSnapshotter struct {}
697
+ type noopSnapshotter struct {
698
+ fsm * FSM
699
+ }
643
700
644
- // Persist doesn't do anything.
701
+ // Persist implements the fsm.Snapshot interface. It doesn't need to persist any
702
+ // state data, but it does persist the raft metadata. This is necessary so we
703
+ // can be sure to capture indexes for operation types that are not sent to the
704
+ // FSM.
645
705
func (s * noopSnapshotter ) Persist (sink raft.SnapshotSink ) error {
706
+ boltSnapshotSink := sink .(* BoltSnapshotSink )
707
+
708
+ // We are processing a snapshot; fast-forward the index, term, and
709
+ // configuration to the latest seen by the raft system.
710
+ if err := s .fsm .witnessSnapshot (& boltSnapshotSink .meta ); err != nil {
711
+ return err
712
+ }
713
+
646
714
return nil
647
715
}
648
716
0 commit comments