@@ -60,8 +60,6 @@ type node struct {
6060 gid uint32
6161 closer * y.Closer
6262
63- lastCommitTs uint64 // Only used to ensure that our commit Ts is monotonically increasing.
64-
6563 streaming int32 // Used to avoid calculating snapshot
6664
6765 canCampaign bool
@@ -488,12 +486,7 @@ func (n *node) commitOrAbort(pkey string, delta *pb.OracleDelta) error {
488486 }
489487
490488 for _ , status := range delta .Txns {
491- if status .CommitTs > 0 && status .CommitTs < n .lastCommitTs {
492- glog .Errorf ("Lastcommit %d > current %d. This would cause some commits to be lost." ,
493- n .lastCommitTs , status .CommitTs )
494- }
495489 toDisk (status .StartTs , status .CommitTs )
496- n .lastCommitTs = status .CommitTs
497490 }
498491 if err := writer .Flush (); err != nil {
499492 return x .Errorf ("Error while flushing to disk: %v" , err )
@@ -623,6 +616,66 @@ func (n *node) rampMeter() {
623616 time .Sleep (3 * time .Millisecond )
624617 }
625618}
619+
620+ func (n * node ) findRaftProgress () (uint64 , error ) {
621+ var applied uint64
622+ err := pstore .View (func (txn * badger.Txn ) error {
623+ item , err := txn .Get (x .RaftKey ())
624+ if err == badger .ErrKeyNotFound {
625+ return nil
626+ }
627+ if err != nil {
628+ return err
629+ }
630+ return item .Value (func (val []byte ) error {
631+ var snap pb.Snapshot
632+ if err := snap .Unmarshal (val ); err != nil {
633+ return err
634+ }
635+ applied = snap .Index
636+ return nil
637+ })
638+ })
639+ return applied , err
640+ }
641+
642+ func (n * node ) updateRaftProgress () error {
643+ // Both leader and followers can independently update their Raft progress. We don't store
644+ // this in Raft WAL. Instead, this is used to just skip over log records that this Alpha
645+ // has already applied, to speed up things on a restart.
646+ snap , err := n .calculateSnapshot (10 ) // 10 is a randomly chosen small number.
647+ if err != nil {
648+ return err
649+ }
650+ if snap == nil {
651+ return nil
652+ }
653+
654+ // Let's check what we already have. And only update if the new snap.Index is ahead of the last
655+ // stored applied.
656+ applied , err := n .findRaftProgress ()
657+ if err != nil {
658+ return err
659+ }
660+ if snap .Index <= applied {
661+ return nil
662+ }
663+
664+ data , err := snap .Marshal ()
665+ x .Check (err )
666+ txn := pstore .NewTransactionAt (math .MaxUint64 , true )
667+ defer txn .Discard ()
668+
669+ if err := txn .Set (x .RaftKey (), data ); err != nil {
670+ return err
671+ }
672+ if err := txn .CommitAt (1 , nil ); err != nil {
673+ return err
674+ }
675+ glog .V (1 ).Infof ("[%#x] Set Raft progress to index: %d." , n .Id , snap .Index )
676+ return nil
677+ }
678+
626679func (n * node ) Run () {
627680 defer n .closer .Done () // CLOSER:1
628681
@@ -647,7 +700,13 @@ func (n *node) Run() {
647700 close (done )
648701 }()
649702
650- var snapshotLoops uint64
703+ applied , err := n .findRaftProgress ()
704+ if err != nil {
705+ glog .Errorf ("While trying to find raft progress: %v" , err )
706+ } else {
707+ glog .Infof ("Found Raft progress in p directory: %d" , applied )
708+ }
709+
651710 for {
652711 select {
653712 case <- done :
@@ -660,23 +719,23 @@ func (n *node) Run() {
660719
661720 case <- slowTicker .C :
662721 n .elog .Printf ("Size of applyCh: %d" , len (n .applyCh ))
722+ if err := n .updateRaftProgress (); err != nil {
723+ glog .Errorf ("While updating Raft progress: %v" , err )
724+ }
725+
663726 if leader {
664- // We try to take a snapshot every slow tick duration, with a 1000 discard entries.
665- // But, once a while, we take a snapshot with 10 discard entries. This avoids the
666- // scenario where after bringing up an Alpha, and doing a hundred schema updates, we
667- // don't take any snapshots because there are not enough updates (discardN=10),
668- // which then really slows down restarts. At the same time, by checking more
669- // frequently, we can quickly take a snapshot if a lot of mutations are coming in
670- // fast (discardN=1000).
671- discardN := 1000
672- if snapshotLoops % 5 == 0 {
673- discardN = 10
674- }
675- snapshotLoops ++
727+ // We keep track of the applied index in the p directory. Even if we don't take
728+ // snapshot for a while and let the Raft logs grow and restart, we would not have to
729+ // run all the log entries, because we can tell Raft.Config to set Applied to that
730+ // index.
731+ // This applied index tracking also covers the case when we have a big index
732+ // rebuild. The rebuild would be tracked just like others and would not need to be
733+ // replayed after a restart, because the Applied config would let us skip right
734+ // through it.
676735 // We use disk based storage for Raft. So, we're not too concerned about
677736 // snapshotting. We just need to do enough, so that we don't have a huge backlog of
678737 // entries to process on a restart.
679- if err := n .proposeSnapshot (discardN ); err != nil {
738+ if err := n .proposeSnapshot (Config . SnapshotAfter ); err != nil {
680739 x .Errorf ("While calculating and proposing snapshot: %v" , err )
681740 }
682741 go n .abortOldTransactions ()
@@ -782,6 +841,10 @@ func (n *node) Run() {
782841 n .elog .Printf ("Found empty data at index: %d" , entry .Index )
783842 n .Applied .Done (entry .Index )
784843
844+ } else if entry .Index < applied {
845+ n .elog .Printf ("Skipping over already applied entry: %d" , entry .Index )
846+ n .Applied .Done (entry .Index )
847+
785848 } else {
786849 proposal := & pb.Proposal {}
787850 if err := proposal .Unmarshal (entry .Data ); err != nil {
@@ -971,7 +1034,6 @@ func (n *node) blockingAbort(req *pb.TxnTimestamps) error {
9711034
9721035 // Let's propose the txn updates received from Zero. This is important because there are edge
9731036 // cases where a txn status might have been missed by the group.
974- glog .Infof ("TryAbort returned with delta: %+v\n " , delta )
9751037 aborted := & pb.OracleDelta {}
9761038 for _ , txn := range delta .Txns {
9771039 // Only pick the aborts. DO NOT propose the commits. They must come in the right order via
@@ -1000,14 +1062,14 @@ func (n *node) blockingAbort(req *pb.TxnTimestamps) error {
10001062// abort. Note that only the leader runs this function.
10011063func (n * node ) abortOldTransactions () {
10021064 // Aborts if not already committed.
1003- starts := posting .Oracle ().TxnOlderThan (5 * time . Minute )
1065+ starts := posting .Oracle ().TxnOlderThan (Config . AbortOlderThan )
10041066 if len (starts ) == 0 {
10051067 return
10061068 }
10071069 glog .Infof ("Found %d old transactions. Acting to abort them.\n " , len (starts ))
10081070 req := & pb.TxnTimestamps {Ts : starts }
10091071 err := n .blockingAbort (req )
1010- glog .Infof ("abortOldTransactions for %d txns. Error: %+v\n " , len (req .Ts ), err )
1072+ glog .Infof ("Done abortOldTransactions for %d txns. Error: %+v\n " , len (req .Ts ), err )
10111073}
10121074
10131075// calculateSnapshot would calculate a snapshot index, considering these factors:
0 commit comments