Skip to content

Commit eba6e5a

Browse files
authored
feat(metrics): Add Raft leadership metrics. (#7338)
* dgraph_raft_has_leader * dgraph_raft_is_leader * dgraph_raft_leader_changes_total
1 parent fde7e64 commit eba6e5a

4 files changed

Lines changed: 115 additions & 30 deletions

File tree

dgraph/cmd/alpha/metrics_test.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,9 @@ func TestMetrics(t *testing.T) {
115115
"dgraph_active_mutations_total", "dgraph_pending_proposals_total",
116116
"dgraph_pending_queries_total",
117117
"dgraph_num_queries_total", "dgraph_alpha_health_status",
118+
119+
// Raft metrics
120+
"dgraph_raft_has_leader", "dgraph_raft_is_leader", "dgraph_raft_leader_changes_total",
118121
}
119122
for _, requiredM := range requiredMetrics {
120123
_, ok := metricsMap[requiredM]

dgraph/cmd/zero/raft.go

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,6 @@ import (
2727
"sync"
2828
"time"
2929

30-
otrace "go.opencensus.io/trace"
31-
3230
"github.com/dgraph-io/dgraph/conn"
3331
"github.com/dgraph-io/dgraph/protos/pb"
3432
"github.com/dgraph-io/dgraph/x"
@@ -39,6 +37,9 @@ import (
3937
"github.com/pkg/errors"
4038
"go.etcd.io/etcd/raft"
4139
"go.etcd.io/etcd/raft/raftpb"
40+
ostats "go.opencensus.io/stats"
41+
"go.opencensus.io/tag"
42+
otrace "go.opencensus.io/trace"
4243
)
4344

4445
var raftDefault = "idx=1; learner=false"
@@ -809,6 +810,12 @@ func (n *node) calculateAndProposeSnapshot() error {
809810
const tickDur = 100 * time.Millisecond
810811

811812
func (n *node) Run() {
813+
// lastLead is for detecting leadership changes
814+
//
815+
// etcd has a similar mechanism for tracking leader changes, with their
816+
// raftReadyHandler.getLead() function that returns the previous leader
817+
lastLead := uint64(math.MaxUint64)
818+
812819
var leader bool
813820
licenseApplied := false
814821
ticker := time.NewTicker(tickDur)
@@ -861,6 +868,22 @@ func (n *node) Run() {
861868
n.server.updateLeases()
862869
}
863870
leader = rd.RaftState == raft.StateLeader
871+
// group id hardcoded as 0
872+
ctx, _ := tag.New(n.ctx, tag.Upsert(x.KeyGroup, "0"))
873+
if rd.SoftState.Lead != lastLead {
874+
lastLead = rd.SoftState.Lead
875+
ostats.Record(ctx, x.RaftLeaderChanges.M(1))
876+
}
877+
if rd.SoftState.Lead != raft.None {
878+
ostats.Record(ctx, x.RaftHasLeader.M(1))
879+
} else {
880+
ostats.Record(ctx, x.RaftHasLeader.M(0))
881+
}
882+
if leader {
883+
ostats.Record(ctx, x.RaftIsLeader.M(1))
884+
} else {
885+
ostats.Record(ctx, x.RaftIsLeader.M(0))
886+
}
864887
// Oracle stream would close the stream once it steps down as leader
865888
// predicate move would cancel any in progress move on stepping down.
866889
n.triggerLeaderChange()

worker/draft.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
"encoding/binary"
2323
"encoding/hex"
2424
"fmt"
25+
"math"
2526
"sort"
2627
"sync"
2728
"sync/atomic"
@@ -1052,6 +1053,12 @@ const tickDur = 100 * time.Millisecond
10521053
func (n *node) Run() {
10531054
defer n.closer.Done() // CLOSER:1
10541055

1056+
// lastLead is for detecting leadership changes
1057+
//
1058+
// etcd has a similar mechanism for tracking leader changes, with their
1059+
// raftReadyHandler.getLead() function that returns the previous leader
1060+
lastLead := uint64(math.MaxUint64)
1061+
10551062
firstRun := true
10561063
var leader bool
10571064
// See also our configuration of HeartbeatTick and ElectionTick.
@@ -1106,6 +1113,23 @@ func (n *node) Run() {
11061113
if rd.SoftState != nil {
11071114
groups().triggerMembershipSync()
11081115
leader = rd.RaftState == raft.StateLeader
1116+
// create context with group id
1117+
ctx, _ := tag.New(n.ctx, tag.Upsert(x.KeyGroup, fmt.Sprintf("%d", n.gid)))
1118+
// detect leadership changes
1119+
if rd.SoftState.Lead != lastLead {
1120+
lastLead = rd.SoftState.Lead
1121+
ostats.Record(ctx, x.RaftLeaderChanges.M(1))
1122+
}
1123+
if rd.SoftState.Lead != raft.None {
1124+
ostats.Record(ctx, x.RaftHasLeader.M(1))
1125+
} else {
1126+
ostats.Record(ctx, x.RaftHasLeader.M(0))
1127+
}
1128+
if leader {
1129+
ostats.Record(ctx, x.RaftIsLeader.M(1))
1130+
} else {
1131+
ostats.Record(ctx, x.RaftIsLeader.M(0))
1132+
}
11091133
}
11101134
if leader {
11111135
// Leader can send messages in parallel with writing to disk.

x/metrics.go

Lines changed: 63 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -111,13 +111,25 @@ var (
111111
// PLCacheHitRatio records the hit ratio of posting list cache.
112112
PLCacheHitRatio = stats.Float64("hit_ratio_posting_cache",
113113
"Hit ratio of posting list cache", stats.UnitDimensionless)
114+
// RaftHasLeader records whether this instance has a leader
115+
RaftHasLeader = stats.Int64("raft_has_leader",
116+
"Whether or not a leader exists for the group", stats.UnitDimensionless)
117+
// RaftIsLeader records whether this instance is the leader
118+
RaftIsLeader = stats.Int64("raft_is_leader",
119+
"Whether or not this instance is the leader of the group", stats.UnitDimensionless)
120+
// RaftLeaderChanges records the total number of leader changes seen.
121+
RaftLeaderChanges = stats.Int64("raft_leader_changes_total",
122+
"Total number of leader changes seen", stats.UnitDimensionless)
114123

115124
// Conf holds the metrics config.
116125
// TODO: Request statistics, latencies, 500, timeouts
117126
Conf *expvar.Map
118127

119128
// Tag keys.
120129

130+
// KeyGroup is the tag key used to record the group for Raft metrics.
131+
KeyGroup, _ = tag.NewKey("group")
132+
121133
// KeyStatus is the tag key used to record the status of the server.
122134
KeyStatus, _ = tag.NewKey("status")
123135
// KeyMethod is the tag key used to record the method (e.g read or mutate).
@@ -141,6 +153,8 @@ var (
141153
KeyStatus, KeyMethod,
142154
}
143155

156+
allRaftKeys = []tag.Key{KeyGroup}
157+
144158
allViews = []*view.View{
145159
{
146160
Name: LatencyMs.Name(),
@@ -163,34 +177,6 @@ var (
163177
Aggregation: view.Count(),
164178
TagKeys: allTagKeys,
165179
},
166-
{
167-
Name: RaftAppliedIndex.Name(),
168-
Measure: RaftAppliedIndex,
169-
Description: RaftAppliedIndex.Description(),
170-
Aggregation: view.LastValue(),
171-
TagKeys: allTagKeys,
172-
},
173-
{
174-
Name: RaftApplyCh.Name(),
175-
Measure: RaftApplyCh,
176-
Description: RaftApplyCh.Description(),
177-
Aggregation: view.LastValue(),
178-
TagKeys: allTagKeys,
179-
},
180-
{
181-
Name: RaftPendingSize.Name(),
182-
Measure: RaftPendingSize,
183-
Description: RaftPendingSize.Description(),
184-
Aggregation: view.LastValue(),
185-
TagKeys: allTagKeys,
186-
},
187-
{
188-
Name: MaxAssignedTs.Name(),
189-
Measure: MaxAssignedTs,
190-
Description: MaxAssignedTs.Description(),
191-
Aggregation: view.LastValue(),
192-
TagKeys: allTagKeys,
193-
},
194180
{
195181
Name: TxnAborts.Name(),
196182
Measure: TxnAborts,
@@ -277,6 +263,55 @@ var (
277263
Aggregation: view.LastValue(),
278264
TagKeys: allTagKeys,
279265
},
266+
{
267+
Name: RaftAppliedIndex.Name(),
268+
Measure: RaftAppliedIndex,
269+
Description: RaftAppliedIndex.Description(),
270+
Aggregation: view.LastValue(),
271+
TagKeys: allRaftKeys,
272+
},
273+
{
274+
Name: RaftApplyCh.Name(),
275+
Measure: RaftApplyCh,
276+
Description: RaftApplyCh.Description(),
277+
Aggregation: view.LastValue(),
278+
TagKeys: allRaftKeys,
279+
},
280+
{
281+
Name: RaftPendingSize.Name(),
282+
Measure: RaftPendingSize,
283+
Description: RaftPendingSize.Description(),
284+
Aggregation: view.LastValue(),
285+
TagKeys: allRaftKeys,
286+
},
287+
{
288+
Name: RaftHasLeader.Name(),
289+
Measure: RaftHasLeader,
290+
Description: RaftHasLeader.Description(),
291+
Aggregation: view.LastValue(),
292+
TagKeys: allRaftKeys,
293+
},
294+
{
295+
Name: RaftIsLeader.Name(),
296+
Measure: RaftIsLeader,
297+
Description: RaftIsLeader.Description(),
298+
Aggregation: view.LastValue(),
299+
TagKeys: allRaftKeys,
300+
},
301+
{
302+
Name: RaftLeaderChanges.Name(),
303+
Measure: RaftLeaderChanges,
304+
Description: RaftLeaderChanges.Description(),
305+
Aggregation: view.Count(),
306+
TagKeys: allRaftKeys,
307+
},
308+
{
309+
Name: MaxAssignedTs.Name(),
310+
Measure: MaxAssignedTs,
311+
Description: MaxAssignedTs.Description(),
312+
Aggregation: view.LastValue(),
313+
TagKeys: allTagKeys,
314+
},
280315
}
281316
)
282317

0 commit comments

Comments
 (0)