
Commit 5299537

storage/raft: Fix memory allocation issue and Metadata tracking issues with snapshots (#8793)
* storage/raft: Split snapshot restore disk write into batches
* Work on snapshot consistency
* Make sure tests send a snapshot
* Fix comment
* Don't remove metrics
* Fix comment
1 parent 7807d45 commit 5299537

5 files changed: +255 additions, −40 deletions

go.sum

Lines changed: 1 addition & 0 deletions

@@ -150,6 +150,7 @@ github.com/coreos/go-oidc v2.1.0+incompatible h1:sdJrfw8akMnCuUlaZU3tE/uYXFgfqom
 github.com/coreos/go-oidc v2.1.0+incompatible/go.mod h1:CgnwVTmzoESiwO9qyAFEMiHoZ1nMCKZlZ9V6mm3/LKc=
 github.com/coreos/go-semver v0.2.0 h1:3Jm3tLmsgAYcjC+4Up7hJrFBPr+n7rAqYeSw/SZazuY=
 github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk=
+github.com/coreos/go-systemd v0.0.0-20180511133405-39ca1b05acc7 h1:u9SHYsPQNyt5tgDm3YN7+9dYrpK96E5wFilTFWIDZOM=
 github.com/coreos/go-systemd v0.0.0-20180511133405-39ca1b05acc7/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4=
 github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e h1:Wf6HqHfScWJN9/ZjdUKyjop4mf3Qdd+1TvvltAvM3m8=
 github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4=

physical/raft/fsm.go

Lines changed: 96 additions & 28 deletions
@@ -88,6 +88,10 @@ type FSM struct {
     storeLatestState bool

     chunker *raftchunking.ChunkingBatchingFSM
+
+    // testSnapshotRestoreError is used in tests to simulate an error while
+    // restoring a snapshot.
+    testSnapshotRestoreError bool
 }

 // NewFSM constructs a FSM using the given directory

@@ -193,20 +197,20 @@ func (f *FSM) witnessIndex(i *IndexValue) {
     }
 }

-func (f *FSM) witnessSnapshot(index, term, configurationIndex uint64, configuration raft.Configuration) error {
+func (f *FSM) witnessSnapshot(metadata *raft.SnapshotMeta) error {
     var indexBytes []byte
     latestIndex, _ := f.LatestState()

-    latestIndex.Index = index
-    latestIndex.Term = term
+    latestIndex.Index = metadata.Index
+    latestIndex.Term = metadata.Term

     var err error
     indexBytes, err = proto.Marshal(latestIndex)
     if err != nil {
         return err
     }

-    protoConfig := raftConfigurationToProtoConfiguration(configurationIndex, configuration)
+    protoConfig := raftConfigurationToProtoConfiguration(metadata.ConfigurationIndex, metadata.Configuration)
     configBytes, err := proto.Marshal(protoConfig)
     if err != nil {
         return err

@@ -232,16 +236,16 @@ func (f *FSM) witnessSnapshot(index, term, configurationIndex uint64, configurat
         }
     }

-    atomic.StoreUint64(f.latestIndex, index)
-    atomic.StoreUint64(f.latestTerm, term)
+    atomic.StoreUint64(f.latestIndex, metadata.Index)
+    atomic.StoreUint64(f.latestTerm, metadata.Term)
     f.latestConfig.Store(protoConfig)

     return nil
 }

 // Delete deletes the given key from the bolt file.
 func (f *FSM) Delete(ctx context.Context, path string) error {
-    defer metrics.MeasureSince([]string{"raft", "delete"}, time.Now())
+    defer metrics.MeasureSince([]string{"raft_storage", "fsm", "delete"}, time.Now())

     f.l.RLock()
     defer f.l.RUnlock()

@@ -253,7 +257,7 @@ func (f *FSM) Delete(ctx context.Context, path string) error {

 // Delete deletes the given key from the bolt file.
 func (f *FSM) DeletePrefix(ctx context.Context, prefix string) error {
-    defer metrics.MeasureSince([]string{"raft", "delete_prefix"}, time.Now())
+    defer metrics.MeasureSince([]string{"raft_storage", "fsm", "delete_prefix"}, time.Now())

     f.l.RLock()
     defer f.l.RUnlock()

@@ -277,7 +281,9 @@ func (f *FSM) DeletePrefix(ctx context.Context, prefix string) error {

 // Get retrieves the value at the given path from the bolt file.
 func (f *FSM) Get(ctx context.Context, path string) (*physical.Entry, error) {
+    // TODO: Remove this outdated metric name in an older release
     defer metrics.MeasureSince([]string{"raft", "get"}, time.Now())
+    defer metrics.MeasureSince([]string{"raft_storage", "fsm", "get"}, time.Now())

     f.l.RLock()
     defer f.l.RUnlock()

@@ -311,7 +317,7 @@ func (f *FSM) Get(ctx context.Context, path string) (*physical.Entry, error) {

 // Put writes the given entry to the bolt file.
 func (f *FSM) Put(ctx context.Context, entry *physical.Entry) error {
-    defer metrics.MeasureSince([]string{"raft", "put"}, time.Now())
+    defer metrics.MeasureSince([]string{"raft_storage", "fsm", "put"}, time.Now())

     f.l.RLock()
     defer f.l.RUnlock()

@@ -324,7 +330,9 @@ func (f *FSM) Put(ctx context.Context, entry *physical.Entry) error {

 // List retrieves the set of keys with the given prefix from the bolt file.
 func (f *FSM) List(ctx context.Context, prefix string) ([]string, error) {
+    // TODO: Remove this outdated metric name in a future release
     defer metrics.MeasureSince([]string{"raft", "list"}, time.Now())
+    defer metrics.MeasureSince([]string{"raft_storage", "fsm", "list"}, time.Now())

     f.l.RLock()
     defer f.l.RUnlock()

@@ -531,6 +539,8 @@ type writeErrorCloser interface {
 // (size, checksum, etc) and a second for the sink of the data. We also use a
 // proto delimited writer so we can stream proto messages to the sink.
 func (f *FSM) writeTo(ctx context.Context, metaSink writeErrorCloser, sink writeErrorCloser) {
+    defer metrics.MeasureSince([]string{"raft_storage", "fsm", "write_snapshot"}, time.Now())
+
     protoWriter := protoio.NewDelimitedWriter(sink)
     metadataProtoWriter := protoio.NewDelimitedWriter(metaSink)

@@ -573,7 +583,9 @@

 // Snapshot implements the FSM interface. It returns a noop snapshot object.
 func (f *FSM) Snapshot() (raft.FSMSnapshot, error) {
-    return &noopSnapshotter{}, nil
+    return &noopSnapshotter{
+        fsm: f,
+    }, nil
 }

 // SetNoopRestore is used to disable restore operations on raft startup. Because

@@ -589,48 +601,91 @@ func (f *FSM) SetNoopRestore(enabled bool) {
 // first deletes the existing bucket to clear all existing data, then recreates
 // it so we can copy in the snapshot.
 func (f *FSM) Restore(r io.ReadCloser) error {
+    defer metrics.MeasureSince([]string{"raft_storage", "fsm", "restore_snapshot"}, time.Now())
+
     if f.noopRestore == true {
         return nil
     }

+    snapMeta := r.(*boltSnapshotMetadataReader).Metadata()
+
     protoReader := protoio.NewDelimitedReader(r, math.MaxInt32)
     defer protoReader.Close()

     f.l.Lock()
     defer f.l.Unlock()

-    // Start a write transaction.
+    // Delete the existing data bucket and create a new one.
+    f.logger.Debug("snapshot restore: deleting bucket")
     err := f.db.Update(func(tx *bolt.Tx) error {
         err := tx.DeleteBucket(dataBucketName)
         if err != nil {
             return err
         }

-        b, err := tx.CreateBucket(dataBucketName)
+        _, err = tx.CreateBucket(dataBucketName)
         if err != nil {
             return err
         }

-        for {
+        return nil
+    })
+    if err != nil {
+        f.logger.Error("could not restore snapshot: could not clear existing bucket", "error", err)
+        return err
+    }
+
+    // If we are testing a failed snapshot error here.
+    if f.testSnapshotRestoreError {
+        return errors.New("Test error")
+    }
+
+    f.logger.Debug("snapshot restore: deleting bucket done")
+    f.logger.Debug("snapshot restore: writing keys")
+
+    var done bool
+    var keys int
+    for !done {
+        err := f.db.Update(func(tx *bolt.Tx) error {
+            b := tx.Bucket(dataBucketName)
             s := new(pb.StorageEntry)
-            err := protoReader.ReadMsg(s)
-            if err != nil {
-                if err == io.EOF {
-                    return nil
+
+            // Commit in batches of 50k. Bolt holds all the data in memory and
+            // doesn't split the pages until commit so we do incremental writes.
+            // This is safe since we have a write lock on the fsm's lock.
+            for i := 0; i < 50000; i++ {
+                err := protoReader.ReadMsg(s)
+                if err != nil {
+                    if err == io.EOF {
+                        done = true
+                        return nil
+                    }
+                    return err
                 }
-                return err
-            }

-            err = b.Put([]byte(s.Key), s.Value)
-            if err != nil {
-                return err
+                err = b.Put([]byte(s.Key), s.Value)
+                if err != nil {
+                    return err
+                }
+                keys += 1
             }
+
+            return nil
+        })
+        if err != nil {
+            f.logger.Error("could not restore snapshot", "error", err)
+            return err
         }

-        return nil
-    })
-    if err != nil {
-        f.logger.Error("could not restore snapshot", "error", err)
+        f.logger.Trace("snapshot restore: writing keys", "num_written", keys)
+    }
+
+    f.logger.Debug("snapshot restore: writing keys done")
+
+    // Write the metadata after we have applied all the snapshot data
+    f.logger.Debug("snapshot restore: writing metadata")
+    if err := f.witnessSnapshot(snapMeta); err != nil {
+        f.logger.Error("could not write metadata", "error", err)
         return err
     }

@@ -639,10 +694,23 @@

 // noopSnapshotter implements the fsm.Snapshot interface. It doesn't do anything
 // since our SnapshotStore reads data out of the FSM on Open().
-type noopSnapshotter struct{}
+type noopSnapshotter struct {
+    fsm *FSM
+}

-// Persist doesn't do anything.
+// Persist implements the fsm.Snapshot interface. It doesn't need to persist any
+// state data, but it does persist the raft metadata. This is necessary so we
+// can be sure to capture indexes for operation types that are not sent to the
+// FSM.
 func (s *noopSnapshotter) Persist(sink raft.SnapshotSink) error {
+    boltSnapshotSink := sink.(*BoltSnapshotSink)
+
+    // We are processing a snapshot, fastforward the index, term, and
+    // configuration to the latest seen by the raft system.
+    if err := s.fsm.witnessSnapshot(&boltSnapshotSink.meta); err != nil {
+        return err
+    }
+
     return nil
 }
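
The substance of the memory fix is the bounded-batch loop in Restore above: bolt keeps every dirty page of a write transaction in memory until commit, so restoring a large snapshot inside a single transaction ballooned allocations, while committing every 50,000 puts keeps the working set bounded. A minimal standalone sketch of the same pattern, assuming go.etcd.io/bbolt and using hypothetical names (restoreBatched, next, batchSize) in place of Vault's delimited proto stream:

package main

import (
    "fmt"
    "log"

    bolt "go.etcd.io/bbolt"
)

// restoreBatched writes entries into the "data" bucket, committing a
// transaction every batchSize puts so bolt never holds the whole stream's
// dirty pages in memory at once. next returns done == true when the
// stream is exhausted. All names here are illustrative, not Vault's.
func restoreBatched(db *bolt.DB, next func() (key, val []byte, done bool), batchSize int) error {
    bucket := []byte("data")
    for {
        var done bool
        err := db.Update(func(tx *bolt.Tx) error {
            b, err := tx.CreateBucketIfNotExists(bucket)
            if err != nil {
                return err
            }
            // Stop after batchSize writes; the outer loop opens a fresh
            // transaction for the next batch, mirroring the 50k batches
            // in FSM.Restore.
            for i := 0; i < batchSize; i++ {
                k, v, d := next()
                if d {
                    done = true
                    return nil
                }
                if err := b.Put(k, v); err != nil {
                    return err
                }
            }
            return nil
        })
        if err != nil {
            return err
        }
        if done {
            return nil
        }
    }
}

func main() {
    db, err := bolt.Open("demo.db", 0o600, nil)
    if err != nil {
        log.Fatal(err)
    }
    defer db.Close()

    // A tiny fake snapshot stream of 120 entries, restored in batches of 50.
    i := 0
    next := func() ([]byte, []byte, bool) {
        if i >= 120 {
            return nil, nil, true
        }
        k := []byte(fmt.Sprintf("key-%03d", i))
        i++
        return k, []byte("value"), false
    }
    if err := restoreBatched(db, next, 50); err != nil {
        log.Fatal(err)
    }
}

As in the real Restore, splitting one transaction into many is only safe because writers are serialized (the FSM holds its write lock for the whole restore), and the snapshot metadata is witnessed only after the final batch commits, so a half-applied restore never looks complete.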

physical/raft/raft_test.go

Lines changed: 62 additions & 5 deletions
@@ -76,22 +76,77 @@ func getRaftWithDir(t testing.TB, bootstrap bool, noStoreState bool, raftDir str
     return backend, raftDir
 }

+func connectPeers(nodes ...*RaftBackend) {
+    for _, node := range nodes {
+        for _, peer := range nodes {
+            if node == peer {
+                continue
+            }
+
+            node.raftTransport.(*raft.InmemTransport).Connect(raft.ServerAddress(peer.NodeID()), peer.raftTransport)
+            peer.raftTransport.(*raft.InmemTransport).Connect(raft.ServerAddress(node.NodeID()), node.raftTransport)
+        }
+    }
+}
+
+func stepDownLeader(t *testing.T, node *RaftBackend) {
+    t.Helper()
+
+    if err := node.raft.LeadershipTransfer().Error(); err != nil {
+        t.Fatal(err)
+    }
+
+    timeout := time.Now().Add(time.Second * 10)
+    for !time.Now().After(timeout) {
+        if err := node.raft.VerifyLeader().Error(); err != nil {
+            return
+        }
+        time.Sleep(100 * time.Millisecond)
+    }
+
+    t.Fatal("still leader")
+}
+
+func waitForLeader(t *testing.T, nodes ...*RaftBackend) *RaftBackend {
+    t.Helper()
+    timeout := time.Now().Add(time.Second * 10)
+    for !time.Now().After(timeout) {
+        for _, node := range nodes {
+            if node.raft.Leader() == raft.ServerAddress(node.NodeID()) {
+                return node
+            }
+        }
+        time.Sleep(100 * time.Millisecond)
+    }
+
+    t.Fatal("no leader")
+    return nil
+}
+
 func compareFSMs(t *testing.T, fsm1, fsm2 *FSM) {
+    t.Helper()
+    if err := compareFSMsWithErr(t, fsm1, fsm2); err != nil {
+        t.Fatal(err)
+    }
+}
+
+func compareFSMsWithErr(t *testing.T, fsm1, fsm2 *FSM) error {
     t.Helper()
     index1, config1 := fsm1.LatestState()
     index2, config2 := fsm2.LatestState()

     if !proto.Equal(index1, index2) {
-        t.Fatalf("indexes did not match: %+v != %+v", index1, index2)
+        return fmt.Errorf("indexes did not match: %+v != %+v", index1, index2)
     }
     if !proto.Equal(config1, config2) {
-        t.Fatalf("configs did not match: %+v != %+v", config1, config2)
+        return fmt.Errorf("configs did not match: %+v != %+v", config1, config2)
     }

-    compareDBs(t, fsm1.db, fsm2.db)
+    return compareDBs(t, fsm1.db, fsm2.db)
 }

-func compareDBs(t *testing.T, boltDB1, boltDB2 *bolt.DB) {
+func compareDBs(t *testing.T, boltDB1, boltDB2 *bolt.DB) error {
+    t.Helper()
     db1 := make(map[string]string)
     db2 := make(map[string]string)

@@ -135,8 +190,10 @@ func compareDBs(t *testing.T, boltDB1, boltDB2 *bolt.DB) {
     }

     if diff := deep.Equal(db1, db2); diff != nil {
-        t.Fatal(diff)
+        return fmt.Errorf("%+v", diff)
     }
+
+    return nil
 }

 func TestRaft_Backend(t *testing.T) {
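
Taken together, the new helpers make multi-node snapshot tests declarative: wire the in-memory transports, wait for a leader, force a transfer, then diff the FSMs. A hypothetical skeleton of such a test (not the test added in this commit; getRaft is assumed to be an existing package helper, raft1.fsm is assumed to be the backend's FSM field, and the writes that actually trigger a snapshot are elided):

func TestRaft_SnapshotConsistency_Sketch(t *testing.T) {
    raft1, _ := getRaft(t, true, false)  // bootstrapped leader (assumed helper)
    raft2, _ := getRaft(t, false, false) // follower

    // Wire the in-memory transports in both directions.
    connectPeers(raft1, raft2)

    // ... join raft2 and write enough data for a snapshot to be sent ...

    // Force a leadership change so snapshot/restore paths are exercised
    // on both sides.
    leader := waitForLeader(t, raft1, raft2)
    stepDownLeader(t, leader)
    waitForLeader(t, raft1, raft2)

    // Both FSMs should converge, including the raft metadata that Restore
    // now writes via witnessSnapshot.
    if err := compareFSMsWithErr(t, raft1.fsm, raft2.fsm); err != nil {
        t.Fatal(err)
    }
}

Returning an error from compareFSMsWithErr instead of calling t.Fatalf directly is presumably what lets tests retry the comparison in a loop until replication catches up.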

physical/raft/snapshot.go

Lines changed: 14 additions & 7 deletions
@@ -104,13 +104,6 @@ func (f *BoltSnapshotStore) Create(version raft.SnapshotVersion, index, term uin
         return nil, fmt.Errorf("unsupported snapshot version %d", version)
     }

-    // We are processing a snapshot, fastforward the index, term, and
-    // configuration to the latest seen by the raft system. This could include
-    // log indexes for operation types that are never sent to the FSM.
-    if err := f.fsm.witnessSnapshot(index, term, configurationIndex, configuration); err != nil {
-        return nil, err
-    }
-
     // Create the sink
     sink := &BoltSnapshotSink{
         store: f,

@@ -208,6 +201,11 @@ func (f *BoltSnapshotStore) Open(id string) (*raft.SnapshotMeta, io.ReadCloser,
         if err != nil {
             return nil, nil, err
         }
+
+        readCloser = &boltSnapshotMetadataReader{
+            meta:       meta,
+            ReadCloser: readCloser,
+        }
     }

     return meta, readCloser, nil

@@ -286,3 +284,12 @@ func (s *BoltSnapshotSink) Cancel() error {

     return nil
 }
+
+type boltSnapshotMetadataReader struct {
+    io.ReadCloser
+    meta *raft.SnapshotMeta
+}
+
+func (r *boltSnapshotMetadataReader) Metadata() *raft.SnapshotMeta {
+    return r.meta
+}
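
boltSnapshotMetadataReader is a plain decorator: embedding io.ReadCloser promotes Read and Close, so the snapshot stream behaves exactly as before, while the added Metadata method is what FSM.Restore reaches via its r.(*boltSnapshotMetadataReader).Metadata() type assertion. A self-contained sketch of the same embedding pattern, with generic hypothetical names:

package main

import (
    "fmt"
    "io"
    "os"
    "strings"
)

// metadataReader decorates an io.ReadCloser with a metadata payload.
// Embedding io.ReadCloser promotes Read and Close, so callers that only
// want the stream are unaffected.
type metadataReader struct {
    io.ReadCloser
    meta map[string]string
}

func (r *metadataReader) Metadata() map[string]string { return r.meta }

// consume stands in for FSM.Restore: it receives a plain io.ReadCloser
// and type-asserts its way back to the metadata, the same shape as the
// r.(*boltSnapshotMetadataReader).Metadata() call in the diff.
func consume(r io.ReadCloser) error {
    defer r.Close()

    mr, ok := r.(*metadataReader)
    if !ok {
        return fmt.Errorf("reader carries no metadata")
    }
    fmt.Println("meta:", mr.Metadata())

    _, err := io.Copy(os.Stdout, r)
    return err
}

func main() {
    body := io.NopCloser(strings.NewReader("snapshot bytes\n"))
    r := &metadataReader{ReadCloser: body, meta: map[string]string{"index": "42", "term": "3"}}
    if err := consume(r); err != nil {
        fmt.Fprintln(os.Stderr, err)
    }
}

The cost of the type assertion is an implicit contract between Open and Restore: hand Restore any other ReadCloser and the unchecked assertion panics, which is workable here because both ends live in the same package.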
