Skip to content

Commit 47a5022

Browse files
Connor1996 authored and disksing committed
*: new store region score function for balance (tikv#1014)
1 parent 3fa9b65 commit 47a5022

33 files changed

+654
-542
lines changed

conf/config.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,9 @@ leader-schedule-limit = 4
6161
region-schedule-limit = 4
6262
replica-schedule-limit = 8
6363
merge-schedule-limit = 8
64-
tolerant-size-ratio = 2.5
64+
tolerant-size-ratio = 5
65+
high-space-ratio = 0.6
66+
low-space-ratio = 0.8
6567

6668
# customized schedulers, the format is as below
6769
# if empty, it will use balance-leader, balance-region, hot-region as default

server/api/label.go

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -72,8 +72,6 @@ func (h *labelsHandler) GetStores(w http.ResponseWriter, r *http.Request) {
7272
return
7373
}
7474

75-
maxDownTime := h.svr.GetScheduleConfig().MaxStoreDownTime.Duration
76-
7775
stores := cluster.GetStores()
7876
storesInfo := &StoresInfo{
7977
Stores: make([]*StoreInfo, 0, len(stores)),
@@ -87,7 +85,7 @@ func (h *labelsHandler) GetStores(w http.ResponseWriter, r *http.Request) {
8785
return
8886
}
8987

90-
storeInfo := newStoreInfo(store, maxDownTime)
88+
storeInfo := newStoreInfo(h.svr.GetScheduleConfig(), store)
9189
storesInfo.Stores = append(storesInfo.Stores, storeInfo)
9290
}
9391
storesInfo.Count = len(storesInfo.Stores)

server/api/store.go

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ const (
6666
downStateName = "Down"
6767
)
6868

69-
func newStoreInfo(store *core.StoreInfo, maxStoreDownTime time.Duration) *StoreInfo {
69+
func newStoreInfo(opt *server.ScheduleConfig, store *core.StoreInfo) *StoreInfo {
7070
s := &StoreInfo{
7171
Store: &MetaStore{
7272
Store: store.Store,
@@ -77,11 +77,11 @@ func newStoreInfo(store *core.StoreInfo, maxStoreDownTime time.Duration) *StoreI
7777
Available: typeutil.ByteSize(store.Stats.GetAvailable()),
7878
LeaderCount: store.LeaderCount,
7979
LeaderWeight: store.LeaderWeight,
80-
LeaderScore: store.LeaderScore(),
80+
LeaderScore: store.LeaderScore(0),
8181
LeaderSize: store.LeaderSize,
8282
RegionCount: store.RegionCount,
8383
RegionWeight: store.RegionWeight,
84-
RegionScore: store.RegionScore(),
84+
RegionScore: store.RegionScore(opt.HighSpaceRatio, opt.LowSpaceRatio, 0),
8585
RegionSize: store.RegionSize,
8686
SendingSnapCount: store.Stats.GetSendingSnapCount(),
8787
ReceivingSnapCount: store.Stats.GetReceivingSnapCount(),
@@ -103,7 +103,7 @@ func newStoreInfo(store *core.StoreInfo, maxStoreDownTime time.Duration) *StoreI
103103
}
104104

105105
if store.State == metapb.StoreState_Up {
106-
if store.DownTime() > maxStoreDownTime {
106+
if store.DownTime() > opt.MaxStoreDownTime.Duration {
107107
s.Store.StateName = downStateName
108108
} else if store.IsDisconnected() {
109109
s.Store.StateName = disconnectedName
@@ -137,8 +137,6 @@ func (h *storeHandler) Get(w http.ResponseWriter, r *http.Request) {
137137
return
138138
}
139139

140-
maxStoreDownTime := h.svr.GetScheduleConfig().MaxStoreDownTime.Duration
141-
142140
vars := mux.Vars(r)
143141
storeIDStr := vars["id"]
144142
storeID, err := strconv.ParseUint(storeIDStr, 10, 64)
@@ -153,7 +151,7 @@ func (h *storeHandler) Get(w http.ResponseWriter, r *http.Request) {
153151
return
154152
}
155153

156-
storeInfo := newStoreInfo(store, maxStoreDownTime)
154+
storeInfo := newStoreInfo(h.svr.GetScheduleConfig(), store)
157155
h.rd.JSON(w, http.StatusOK, storeInfo)
158156
}
159157

@@ -324,8 +322,6 @@ func (h *storesHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
324322
return
325323
}
326324

327-
maxStoreDownTime := h.svr.GetScheduleConfig().MaxStoreDownTime.Duration
328-
329325
stores := cluster.GetStores()
330326
StoresInfo := &StoresInfo{
331327
Stores: make([]*StoreInfo, 0, len(stores)),
@@ -345,7 +341,7 @@ func (h *storesHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
345341
return
346342
}
347343

348-
storeInfo := newStoreInfo(store, maxStoreDownTime)
344+
storeInfo := newStoreInfo(h.svr.GetScheduleConfig(), store)
349345
StoresInfo.Stores = append(StoresInfo.Stores, storeInfo)
350346
}
351347
StoresInfo.Count = len(StoresInfo.Stores)

server/api/store_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -270,14 +270,14 @@ func (s *testStoreSuite) TestDownState(c *C) {
270270
Stats: &pdpb.StoreStats{},
271271
LastHeartbeatTS: time.Now(),
272272
}
273-
storeInfo := newStoreInfo(store, time.Hour)
273+
storeInfo := newStoreInfo(s.svr.GetScheduleConfig(), store)
274274
c.Assert(storeInfo.Store.StateName, Equals, metapb.StoreState_Up.String())
275275

276276
store.LastHeartbeatTS = time.Now().Add(-time.Minute * 2)
277-
storeInfo = newStoreInfo(store, time.Hour)
277+
storeInfo = newStoreInfo(s.svr.GetScheduleConfig(), store)
278278
c.Assert(storeInfo.Store.StateName, Equals, disconnectedName)
279279

280280
store.LastHeartbeatTS = time.Now().Add(-time.Hour * 2)
281-
storeInfo = newStoreInfo(store, time.Hour)
281+
storeInfo = newStoreInfo(s.svr.GetScheduleConfig(), store)
282282
c.Assert(storeInfo.Store.StateName, Equals, downStateName)
283283
}

server/api/trend.go

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -107,8 +107,6 @@ func (h *trendHandler) Handle(w http.ResponseWriter, r *http.Request) {
107107
}
108108

109109
func (h *trendHandler) getTrendStores() ([]trendStore, error) {
110-
maxStoreDownTime := h.svr.GetScheduleConfig().MaxStoreDownTime.Duration
111-
112110
var readStats, writeStats core.StoreHotRegionsStat
113111
if hotRead := h.GetHotReadRegions(); hotRead != nil {
114112
readStats = hotRead.AsLeader
@@ -123,7 +121,7 @@ func (h *trendHandler) getTrendStores() ([]trendStore, error) {
123121

124122
trendStores := make([]trendStore, 0, len(stores))
125123
for _, store := range stores {
126-
info := newStoreInfo(store, maxStoreDownTime)
124+
info := newStoreInfo(h.svr.GetScheduleConfig(), store)
127125
s := trendStore{
128126
ID: info.Store.GetId(),
129127
Address: info.Store.GetAddress(),

server/cache.go

Lines changed: 8 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -167,28 +167,6 @@ func (c *clusterInfo) GetStores() []*core.StoreInfo {
167167
return c.BasicCluster.GetStores()
168168
}
169169

170-
// GetStoresAverageScore returns the total resource score of all unfiltered stores.
171-
func (c *clusterInfo) GetStoresAverageScore(kind core.ResourceKind, filters ...schedule.Filter) float64 {
172-
c.RLock()
173-
defer c.RUnlock()
174-
175-
var totalResourceSize int64
176-
var totalResourceWeight float64
177-
for _, s := range c.BasicCluster.GetStores() {
178-
if schedule.FilterSource(c, s, filters) {
179-
continue
180-
}
181-
182-
totalResourceWeight += s.ResourceWeight(kind)
183-
totalResourceSize += s.ResourceSize(kind)
184-
}
185-
186-
if totalResourceWeight == 0 {
187-
return 0
188-
}
189-
return float64(totalResourceSize) / totalResourceWeight
190-
}
191-
192170
func (c *clusterInfo) getMetaStores() []*metapb.Store {
193171
c.RLock()
194172
defer c.RUnlock()
@@ -564,6 +542,14 @@ func (c *clusterInfo) GetTolerantSizeRatio() float64 {
564542
return c.opt.GetTolerantSizeRatio()
565543
}
566544

545+
func (c *clusterInfo) GetLowSpaceRatio() float64 {
546+
return c.opt.GetLowSpaceRatio()
547+
}
548+
549+
func (c *clusterInfo) GetHighSpaceRatio() float64 {
550+
return c.opt.GetHighSpaceRatio()
551+
}
552+
567553
func (c *clusterInfo) GetMaxSnapshotCount() uint64 {
568554
return c.opt.GetMaxSnapshotCount()
569555
}

server/config.go

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,18 @@ type ScheduleConfig struct {
352352
MergeScheduleLimit uint64 `toml:"merge-schedule-limit,omitempty" json:"merge-schedule-limit"`
353353
// TolerantSizeRatio is the ratio of buffer size for balance scheduler.
354354
TolerantSizeRatio float64 `toml:"tolerant-size-ratio,omitempty" json:"tolerant-size-ratio"`
355+
//
356+
// high space stage transition stage low space stage
357+
// |--------------------|-----------------------------|-------------------------|
358+
// ^ ^ ^ ^
359+
// 0 HighSpaceRatio * capacity LowSpaceRatio * capacity capacity
360+
//
361+
// LowSpaceRatio is the lowest usage ratio of store which is regarded as low space.
362+
// When in low space, store region score increases to very large and varies inversely with available size.
363+
LowSpaceRatio float64 `toml:"low-space-ratio,omitempty" json:"low-space-ratio"`
364+
// HighSpaceRatio is the highest usage ratio of store which is regarded as high space.
365+
// High space means there is a lot of spare capacity, and store region score varies directly with used size.
366+
HighSpaceRatio float64 `toml:"high-space-ratio,omitempty" json:"high-space-ratio"`
355367
// EnableRaftLearner is the option for using AddLearnerNode instead of AddNode
356368
EnableRaftLearner bool `toml:"enable-raft-learner" json:"enable-raft-learner,string"`
357369
// Schedulers support for loading customized schedulers
@@ -371,6 +383,8 @@ func (c *ScheduleConfig) clone() *ScheduleConfig {
371383
ReplicaScheduleLimit: c.ReplicaScheduleLimit,
372384
MergeScheduleLimit: c.MergeScheduleLimit,
373385
TolerantSizeRatio: c.TolerantSizeRatio,
386+
LowSpaceRatio: c.LowSpaceRatio,
387+
HighSpaceRatio: c.HighSpaceRatio,
374388
EnableRaftLearner: c.EnableRaftLearner,
375389
Schedulers: schedulers,
376390
}
@@ -396,7 +410,9 @@ const (
396410
defaultRegionScheduleLimit = 4
397411
defaultReplicaScheduleLimit = 8
398412
defaultMergeScheduleLimit = 8
399-
defaultTolerantSizeRatio = 2.5
413+
defaultTolerantSizeRatio = 5
414+
defaultLowSpaceRatio = 0.8
415+
defaultHighSpaceRatio = 0.6
400416
)
401417

402418
var defaultSchedulers = SchedulerConfigs{
@@ -416,6 +432,8 @@ func (c *ScheduleConfig) adjust() {
416432
adjustUint64(&c.ReplicaScheduleLimit, defaultReplicaScheduleLimit)
417433
adjustUint64(&c.MergeScheduleLimit, defaultMergeScheduleLimit)
418434
adjustFloat64(&c.TolerantSizeRatio, defaultTolerantSizeRatio)
435+
adjustFloat64(&c.LowSpaceRatio, defaultLowSpaceRatio)
436+
adjustFloat64(&c.HighSpaceRatio, defaultHighSpaceRatio)
419437
adjustSchedulers(&c.Schedulers, defaultSchedulers)
420438
}
421439

server/coordinator.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -436,6 +436,7 @@ func (c *coordinator) addOperatorLocked(op *schedule.Operator) bool {
436436
if old, ok := c.operators[regionID]; ok {
437437
if !isHigherPriorityOperator(op, old) {
438438
log.Infof("[region %v] cancel add operator, old: %s", regionID, old)
439+
operatorCounter.WithLabelValues(op.Desc(), "canceled").Inc()
439440
return false
440441
}
441442
log.Infof("[region %v] replace old operator: %s", regionID, old)
@@ -470,6 +471,7 @@ func (c *coordinator) addOperators(ops ...*schedule.Operator) bool {
470471
for _, op := range ops {
471472
if old := c.operators[op.RegionID()]; old != nil && !isHigherPriorityOperator(op, old) {
472473
log.Infof("[region %v] cancel add operators, old: %s", op.RegionID(), old)
474+
operatorCounter.WithLabelValues(op.Desc(), "canceled").Inc()
473475
return false
474476
}
475477
}

server/coordinator_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,8 @@ func (c *testClusterInfo) addRegionStore(storeID uint64, regionCount int) {
6565
store.LastHeartbeatTS = time.Now()
6666
store.RegionCount = regionCount
6767
store.RegionSize = int64(regionCount) * 10
68-
store.Stats.Capacity = uint64(1024)
69-
store.Stats.Available = store.Stats.Capacity
68+
store.Stats.Capacity = 1000 * (1 << 20)
69+
store.Stats.Available = store.Stats.Capacity - uint64(store.RegionSize)
7070
c.putStore(store)
7171
}
7272

server/core/store.go

Lines changed: 43 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ type StoreInfo struct {
4545
func NewStoreInfo(store *metapb.Store) *StoreInfo {
4646
return &StoreInfo{
4747
Store: store,
48+
Stats: &pdpb.StoreStats{},
4849
LeaderWeight: 1.0,
4950
RegionWeight: 1.0,
5051
}
@@ -103,15 +104,46 @@ func (s *StoreInfo) DownTime() time.Duration {
103104
}
104105

105106
const minWeight = 1e-6
107+
const maxScore = 1024 * 1024 * 1024
106108

107-
// LeaderScore returns the store's leader score: leaderCount / leaderWeight.
108-
func (s *StoreInfo) LeaderScore() float64 {
109-
return float64(s.LeaderSize) / math.Max(s.LeaderWeight, minWeight)
109+
// LeaderScore returns the store's leader score: leaderSize / leaderWeight.
110+
func (s *StoreInfo) LeaderScore(delta int64) float64 {
111+
return float64(s.LeaderSize+delta) / math.Max(s.LeaderWeight, minWeight)
110112
}
111113

112-
// RegionScore returns the store's region score: regionSize / regionWeight.
113-
func (s *StoreInfo) RegionScore() float64 {
114-
return float64(s.RegionSize) / math.Max(s.RegionWeight, minWeight)
114+
// RegionScore returns the store's region score.
115+
func (s *StoreInfo) RegionScore(highSpaceRatio, lowSpaceRatio float64, delta int64) float64 {
116+
if s.RegionSize == 0 {
117+
return float64(delta)
118+
}
119+
120+
capacity := float64(s.Stats.GetCapacity()) / (1 << 20)
121+
available := float64(s.Stats.GetAvailable()) / (1 << 20)
122+
123+
var score float64
124+
125+
// because of rocksdb compression, region size is larger than actual used size
126+
amplification := float64(s.RegionSize) / (float64(s.Stats.GetUsedSize()) / (1 << 20))
127+
128+
if available-float64(delta)/amplification >= (1-highSpaceRatio)*capacity {
129+
score = float64(s.RegionSize + delta)
130+
} else if available-float64(delta)/amplification <= (1-lowSpaceRatio)*capacity {
131+
score = maxScore - (available - float64(delta)/amplification)
132+
} else {
133+
// to make the score function continuous, we use linear function y = k * x + b as transition period
134+
// from above we know that there are two points must on the function image
135+
// p1(highSpaceRatio*capacity*amplification, highSpaceRatio*capacity*amplification) and
136+
// p2(lowSpaceRatio*capacity*amplification, maxScore-(1-lowSpaceRatio)*capacity)
137+
// so k = (y2 - y1) / (x2 - x1)
138+
x1, y1 := highSpaceRatio*capacity*amplification, highSpaceRatio*capacity*amplification
139+
x2, y2 := lowSpaceRatio*capacity*amplification, maxScore-(1-lowSpaceRatio)*capacity
140+
141+
k := (y2 - y1) / (x2 - x1)
142+
b := y1 - k*x1
143+
score = k*float64(s.RegionSize+delta) + b
144+
}
145+
146+
return score / math.Max(s.RegionWeight, minWeight)
115147
}
116148

117149
// StorageSize returns store's used storage size reported from tikv.
@@ -127,11 +159,9 @@ func (s *StoreInfo) AvailableRatio() float64 {
127159
return float64(s.Stats.GetAvailable()) / float64(s.Stats.GetCapacity())
128160
}
129161

130-
const storeLowSpaceThreshold = 0.2
131-
132162
// IsLowSpace checks if the store is lack of space.
133-
func (s *StoreInfo) IsLowSpace() bool {
134-
return s.Stats != nil && s.AvailableRatio() < storeLowSpaceThreshold
163+
func (s *StoreInfo) IsLowSpace(lowSpaceRatio float64) bool {
164+
return s.Stats != nil && s.AvailableRatio() < 1-lowSpaceRatio
135165
}
136166

137167
// ResourceCount returns count of leader/region in the store.
@@ -159,12 +189,12 @@ func (s *StoreInfo) ResourceSize(kind ResourceKind) int64 {
159189
}
160190

161191
// ResourceScore returns score of leader/region in the store.
162-
func (s *StoreInfo) ResourceScore(kind ResourceKind) float64 {
192+
func (s *StoreInfo) ResourceScore(kind ResourceKind, highSpaceRatio, lowSpaceRatio float64, delta int64) float64 {
163193
switch kind {
164194
case LeaderKind:
165-
return s.LeaderScore()
195+
return s.LeaderScore(delta)
166196
case RegionKind:
167-
return s.RegionScore()
197+
return s.RegionScore(highSpaceRatio, lowSpaceRatio, delta)
168198
default:
169199
return 0
170200
}

0 commit comments

Comments
 (0)