Skip to content

Commit a7c3dd0

Browse files
authored
[stats] costed index scan perf (#2421)
* [stats] costed index scan perf
* generalize stats interfaces
* zach comments
1 parent 947e7c3 commit a7c3dd0

File tree

9 files changed

+343
-216
lines changed

9 files changed

+343
-216
lines changed

enginetest/queries/stats_queries.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ var StatisticsQueries = []ScriptTest{
3535
SkipResultCheckOnServerEngine: true, // the non-interface types are not identified over the wire result
3636
Query: "SELECT * FROM information_schema.column_statistics",
3737
Expected: []sql.Row{
38-
{"mydb", "t", "i", stats.NewStatistic(3, 3, 0, 24, time.Now(), sql.NewStatQualifier("mydb", "t", "primary"), []string{"i"}, []sql.Type{types.Int64}, []*stats.Bucket{
38+
{"mydb", "t", "i", stats.NewStatistic(3, 3, 0, 24, time.Now(), sql.NewStatQualifier("mydb", "t", "primary"), []string{"i"}, []sql.Type{types.Int64}, []sql.HistogramBucket{
3939
stats.NewHistogramBucket(1, 1, 0, 1, sql.Row{int64(1)}, nil, nil),
4040
stats.NewHistogramBucket(1, 1, 0, 1, sql.Row{int64(2)}, nil, nil),
4141
stats.NewHistogramBucket(1, 1, 0, 1, sql.Row{int64(3)}, nil, nil),
@@ -60,7 +60,7 @@ var StatisticsQueries = []ScriptTest{
6060
SkipResultCheckOnServerEngine: true, // the non-interface types are not identified over the wire result
6161
Query: "SELECT * FROM information_schema.column_statistics",
6262
Expected: []sql.Row{
63-
{"mydb", "t", "i", stats.NewStatistic(40, 40, 1, 0, time.Now(), sql.NewStatQualifier("mydb", "t", "primary"), []string{"i"}, []sql.Type{types.Int64}, []*stats.Bucket{
63+
{"mydb", "t", "i", stats.NewStatistic(40, 40, 1, 0, time.Now(), sql.NewStatQualifier("mydb", "t", "primary"), []string{"i"}, []sql.Type{types.Int64}, []sql.HistogramBucket{
6464
stats.NewHistogramBucket(20, 20, 0, 1, sql.Row{float64(50)}, nil, nil),
6565
stats.NewHistogramBucket(20, 20, 0, 1, sql.Row{float64(80)}, nil, nil),
6666
}, sql.IndexClassDefault, nil),
@@ -89,13 +89,13 @@ var StatisticsQueries = []ScriptTest{
8989
SkipResultCheckOnServerEngine: true, // the non-interface types are not identified over the wire result
9090
Query: "SELECT * FROM information_schema.column_statistics",
9191
Expected: []sql.Row{
92-
{"mydb", "t", "i", stats.NewStatistic(3, 3, 0, 48, time.Now(), sql.NewStatQualifier("mydb", "t", "primary"), []string{"i"}, []sql.Type{types.Int64}, []*stats.Bucket{
92+
{"mydb", "t", "i", stats.NewStatistic(3, 3, 0, 48, time.Now(), sql.NewStatQualifier("mydb", "t", "primary"), []string{"i"}, []sql.Type{types.Int64}, []sql.HistogramBucket{
9393
stats.NewHistogramBucket(1, 1, 0, 1, sql.Row{int64(1)}, nil, []sql.Row{}),
9494
stats.NewHistogramBucket(1, 1, 0, 1, sql.Row{int64(2)}, nil, []sql.Row{}),
9595
stats.NewHistogramBucket(1, 1, 0, 1, sql.Row{int64(3)}, nil, []sql.Row{}),
9696
}, sql.IndexClassDefault, nil),
9797
},
98-
{"mydb", "t", "j", stats.NewStatistic(3, 3, 0, 48, time.Now(), sql.NewStatQualifier("mydb", "t", "j"), []string{"j"}, []sql.Type{types.Int64}, []*stats.Bucket{
98+
{"mydb", "t", "j", stats.NewStatistic(3, 3, 0, 48, time.Now(), sql.NewStatQualifier("mydb", "t", "j"), []string{"j"}, []sql.Type{types.Int64}, []sql.HistogramBucket{
9999
stats.NewHistogramBucket(1, 1, 0, 1, sql.Row{int64(4)}, nil, []sql.Row{}),
100100
stats.NewHistogramBucket(1, 1, 0, 1, sql.Row{int64(5)}, nil, []sql.Row{}),
101101
stats.NewHistogramBucket(1, 1, 0, 1, sql.Row{int64(6)}, nil, []sql.Row{}),
@@ -117,7 +117,7 @@ var StatisticsQueries = []ScriptTest{
117117
SkipResultCheckOnServerEngine: true, // the non-interface types are not identified over the wire result
118118
Query: "SELECT * FROM information_schema.column_statistics",
119119
Expected: []sql.Row{
120-
{"mydb", "t", "i", stats.NewStatistic(4, 4, 0, 32, time.Now(), sql.NewStatQualifier("mydb", "t", "primary"), []string{"i"}, []sql.Type{types.Float64}, []*stats.Bucket{
120+
{"mydb", "t", "i", stats.NewStatistic(4, 4, 0, 32, time.Now(), sql.NewStatQualifier("mydb", "t", "primary"), []string{"i"}, []sql.Type{types.Float64}, []sql.HistogramBucket{
121121
stats.NewHistogramBucket(1, 1, 0, 1, sql.Row{float64(1.25)}, nil, []sql.Row{}),
122122
stats.NewHistogramBucket(1, 1, 0, 1, sql.Row{float64(7.5)}, nil, []sql.Row{}),
123123
stats.NewHistogramBucket(1, 1, 0, 1, sql.Row{float64(10.5)}, nil, []sql.Row{}),

memory/stats.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@ func (s *StatsProv) estimateStats(ctx *sql.Context, table sql.Table, keys map[st
139139
}
140140
offset := len(keyVals) / bucketCnt
141141
perBucket := int(rowCount) / bucketCnt
142-
buckets := make([]*stats.Bucket, bucketCnt)
142+
buckets := make([]sql.HistogramBucket, bucketCnt)
143143
for i := range buckets {
144144
var upperBound []interface{}
145145
for _, v := range keyVals[i*offset] {

sql/analyzer/costed_index_scan.go

Lines changed: 86 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,13 @@ func getCostedIndexScan(ctx *sql.Context, statsProv sql.StatsProvider, rt sql.Ta
255255
retFilters = b.leftover
256256
}
257257

258-
return ret, c.bestStat, retFilters, nil
258+
bestStat, err := c.bestStat.WithHistogram(c.bestHist)
259+
if err != nil {
260+
return nil, nil, nil, err
261+
}
262+
bestStat = stats.UpdateCounts(bestStat)
263+
264+
return ret, bestStat, retFilters, nil
259265
}
260266

261267
func addIndexScans(m *memo.Memo) error {
@@ -363,6 +369,8 @@ type indexCoster struct {
363369
idToExpr map[indexScanId]sql.Expression
364370
// bestStat is the lowest cardinality indexScan option
365371
bestStat sql.Statistic
372+
bestHist []sql.HistogramBucket
373+
bestCnt uint64
366374
// bestFilters is the set of conjunctions used to create bestStat
367375
bestFilters sql.FastIntSet
368376
// bestConstant are the constant best filters
@@ -377,29 +385,30 @@ type indexCoster struct {
377385
func (c *indexCoster) cost(f indexFilter, stat sql.Statistic, idx sql.Index) error {
378386
ordinals := ordinalsForStat(stat)
379387

380-
newStat := stat
388+
var newHist []sql.HistogramBucket
389+
var newFds *sql.FuncDepSet
381390
var filters sql.FastIntSet
382391
var prefix int
383392
var err error
384393
var ok bool
385394

386395
switch f := f.(type) {
387396
case *iScanAnd:
388-
newStat, filters, prefix, err = c.costIndexScanAnd(f, stat, ordinals, idx)
397+
newHist, newFds, filters, prefix, err = c.costIndexScanAnd(f, stat, stat.Histogram(), ordinals, idx)
389398
if err != nil {
390399
return err
391400
}
392401

393402
case *iScanOr:
394-
newStat, ok, err = c.costIndexScanOr(f, stat, ordinals, idx)
403+
newHist, newFds, ok, err = c.costIndexScanOr(f, stat, stat.Histogram(), ordinals, idx)
395404
if err != nil {
396405
return err
397406
}
398407
if ok {
399408
filters.Add(int(f.id))
400409
}
401410
case *iScanLeaf:
402-
newStat, ok, prefix, err = c.costIndexScanLeaf(f, stat, ordinals, idx)
411+
newHist, newFds, ok, prefix, err = c.costIndexScanLeaf(f, stat, stat.Histogram(), ordinals, idx)
403412
if err != nil {
404413
return err
405414
}
@@ -410,25 +419,33 @@ func (c *indexCoster) cost(f indexFilter, stat sql.Statistic, idx sql.Index) err
410419
panic("unreachable")
411420
}
412421

413-
c.updateBest(newStat, filters, prefix)
422+
if newFds == nil {
423+
newFds = &sql.FuncDepSet{}
424+
}
425+
426+
c.updateBest(stat, newHist, newFds, filters, prefix)
427+
414428
return nil
415429
}
416430

417-
func (c *indexCoster) updateBest(s sql.Statistic, filters sql.FastIntSet, prefix int) {
431+
func (c *indexCoster) updateBest(s sql.Statistic, hist []sql.HistogramBucket, fds *sql.FuncDepSet, filters sql.FastIntSet, prefix int) {
418432
if s == nil || filters.Len() == 0 {
419433
return
420434
}
435+
rowCnt, _, _ := stats.GetNewCounts(hist)
421436

422437
var update bool
423438
defer func() {
424439
if update {
425-
c.bestStat = s
440+
c.bestStat = s.WithFuncDeps(fds)
441+
c.bestHist = hist
442+
c.bestCnt = rowCnt
426443
c.bestFilters = filters
427444
c.bestPrefix = prefix
428445
}
429446
}()
430447

431-
if c.bestStat == nil || s.RowCount() < c.bestStat.RowCount() {
448+
if c.bestStat == nil || rowCnt < c.bestCnt {
432449
update = true
433450
return
434451
} else if c.bestStat.FuncDeps().HasMax1Row() {
@@ -437,9 +454,9 @@ func (c *indexCoster) updateBest(s sql.Statistic, filters sql.FastIntSet, prefix
437454
// any prefix is better than no prefix
438455
update = prefix > c.bestPrefix
439456
return
440-
} else if s.RowCount() == c.bestStat.RowCount() {
457+
} else if rowCnt == c.bestCnt {
441458
// hand rules when stats don't exist or match exactly
442-
cmp := s.FuncDeps()
459+
cmp := fds
443460
best := c.bestStat.FuncDeps()
444461
if cmp.HasMax1Row() {
445462
update = true
@@ -1111,28 +1128,34 @@ func ordinalsForStat(stat sql.Statistic) map[string]int {
11111128
// updated statistic, the subset of applicable filters, the maximum prefix
11121129
// key created by a subset of equality filters (from conjunction only),
11131130
// or an error if applicable.
1114-
func (c *indexCoster) costIndexScanAnd(filter *iScanAnd, s sql.Statistic, ordinals map[string]int, idx sql.Index) (sql.Statistic, sql.FastIntSet, int, error) {
1131+
func (c *indexCoster) costIndexScanAnd(filter *iScanAnd, s sql.Statistic, buckets []sql.HistogramBucket, ordinals map[string]int, idx sql.Index) ([]sql.HistogramBucket, *sql.FuncDepSet, sql.FastIntSet, int, error) {
11151132
// first step finds the conjunctions that match index prefix columns.
11161133
// we divide into eqFilters and rangeFilters
11171134

1118-
ret := s
1135+
ret := s.Histogram()
11191136
var exact sql.FastIntSet
11201137

11211138
if len(filter.orChildren) > 0 {
11221139
for _, or := range filter.orChildren {
1123-
childStat, ok, err := c.costIndexScanOr(or.(*iScanOr), s, ordinals, idx)
1140+
childStat, fds, ok, err := c.costIndexScanOr(or.(*iScanOr), s, buckets, ordinals, idx)
11241141
if err != nil {
1125-
return nil, sql.FastIntSet{}, 0, err
1142+
return nil, nil, sql.FastIntSet{}, 0, err
11261143
}
11271144
// if valid, INTERSECT
11281145
if ok {
1129-
ret = stats.Intersect(ret, childStat)
1146+
if fds != nil {
1147+
s = s.WithFuncDeps(fds)
1148+
}
1149+
ret, err = stats.Intersect(ret, childStat, s.Types())
1150+
if err != nil {
1151+
return nil, nil, sql.FastIntSet{}, 0, err
1152+
}
11301153
exact.Add(int(or.Id()))
11311154
}
11321155
}
11331156
}
11341157

1135-
conj := newConjCollector(ret, ordinals)
1158+
conj := newConjCollector(s, ret, ordinals)
11361159
for _, c := range s.Columns() {
11371160
if colFilters, ok := filter.leafChildren[c]; ok {
11381161
for _, f := range colFilters {
@@ -1143,46 +1166,58 @@ func (c *indexCoster) costIndexScanAnd(filter *iScanAnd, s sql.Statistic, ordina
11431166

11441167
if exact.Len()+conj.applied.Len() == filter.childCnt() {
11451168
// matched all filters
1146-
return conj.stat, sql.NewFastIntSet(int(filter.id)), conj.missingPrefix, nil
1169+
return conj.hist, conj.fds, sql.NewFastIntSet(int(filter.id)), conj.missingPrefix, nil
11471170
}
11481171

1149-
return conj.stat, exact.Union(conj.applied), conj.missingPrefix, nil
1172+
return conj.hist, conj.fds, exact.Union(conj.applied), conj.missingPrefix, nil
11501173
}
11511174

1152-
func (c *indexCoster) costIndexScanOr(filter *iScanOr, s sql.Statistic, ordinals map[string]int, idx sql.Index) (sql.Statistic, bool, error) {
1175+
func (c *indexCoster) costIndexScanOr(filter *iScanOr, s sql.Statistic, buckets []sql.HistogramBucket, ordinals map[string]int, idx sql.Index) ([]sql.HistogramBucket, *sql.FuncDepSet, bool, error) {
11531176
// OR just unions the statistics from each child?
11541177
// if one of the children is invalid, we balk and return false
11551178
// otherwise we union the buckets between the children
1156-
ret := s
1179+
ret := buckets
11571180
for _, child := range filter.children {
11581181
switch child := child.(type) {
11591182
case *iScanAnd:
1160-
childStat, ids, _, err := c.costIndexScanAnd(child, s, ordinals, idx)
1183+
childBuckets, fds, ids, _, err := c.costIndexScanAnd(child, s, buckets, ordinals, idx)
11611184
if err != nil {
1162-
return nil, false, err
1185+
return nil, nil, false, err
11631186
}
11641187
if ids.Len() != 1 || !ids.Contains(int(child.Id())) {
11651188
// scan option missed some filters
1166-
return nil, false, nil
1189+
return nil, nil, false, nil
1190+
}
1191+
if fds != nil {
1192+
s = s.WithFuncDeps(fds)
1193+
}
1194+
ret, err = stats.Union(buckets, childBuckets, s.Types())
1195+
if err != nil {
1196+
return nil, nil, false, err
11671197
}
1168-
ret = stats.Union(s, childStat)
11691198

11701199
case *iScanLeaf:
11711200
var ok bool
1172-
childStat, ok, _, err := c.costIndexScanLeaf(child, s, ordinals, idx)
1201+
childBuckets, fds, ok, _, err := c.costIndexScanLeaf(child, s, ret, ordinals, idx)
11731202
if err != nil {
1174-
return nil, false, err
1203+
return nil, nil, false, err
11751204
}
11761205
if !ok {
1177-
return nil, false, nil
1206+
return nil, nil, false, nil
1207+
}
1208+
if fds != nil {
1209+
s = s.WithFuncDeps(fds)
1210+
}
1211+
ret, err = stats.Union(ret, childBuckets, s.Types())
1212+
if err != nil {
1213+
return nil, nil, false, err
11781214
}
1179-
ret = stats.Union(s, childStat)
11801215

11811216
default:
1182-
return nil, false, fmt.Errorf("invalid *iScanOr child: %T", child)
1217+
return nil, nil, false, fmt.Errorf("invalid *iScanOr child: %T", child)
11831218
}
11841219
}
1185-
return ret, true, nil
1220+
return ret, nil, true, nil
11861221
}
11871222

11881223
// indexHasContentHashedFieldForFilter returns true if the given index |idx| has a content-hashed field that is used
@@ -1212,10 +1247,10 @@ func indexHasContentHashedFieldForFilter(filter *iScanLeaf, idx sql.Index, ordin
12121247
// costIndexScanLeaf tries to apply a leaf filter to an index represented
12131248
// by a statistic, returning the updated statistic, whether the filter was
12141249
// applicable, and the maximum prefix key (0 or 1 for a leaf).
1215-
func (c *indexCoster) costIndexScanLeaf(filter *iScanLeaf, s sql.Statistic, ordinals map[string]int, idx sql.Index) (sql.Statistic, bool, int, error) {
1250+
func (c *indexCoster) costIndexScanLeaf(filter *iScanLeaf, s sql.Statistic, buckets []sql.HistogramBucket, ordinals map[string]int, idx sql.Index) ([]sql.HistogramBucket, *sql.FuncDepSet, bool, int, error) {
12161251
ord, ok := ordinals[strings.ToLower(filter.gf.Name())]
12171252
if !ok {
1218-
return nil, false, 0, nil
1253+
return nil, nil, false, 0, nil
12191254
}
12201255

12211256
// indexes with content-hashed fields can be used to test equality or compare with NULL,
@@ -1224,21 +1259,21 @@ func (c *indexCoster) costIndexScanLeaf(filter *iScanLeaf, s sql.Statistic, ordi
12241259
switch filter.op {
12251260
case indexScanOpEq, indexScanOpNotEq, indexScanOpNullSafeEq, indexScanOpIsNull, indexScanOpIsNotNull:
12261261
default:
1227-
return nil, false, 0, nil
1262+
return nil, nil, false, 0, nil
12281263
}
12291264
}
12301265

12311266
switch filter.op {
12321267
case indexScanOpSpatialEq:
12331268
stat, ok, err := c.costSpatial(filter, s, ord)
1234-
return stat, ok, 0, err
1269+
return buckets, stat.FuncDeps(), ok, 0, err
12351270
case indexScanOpFulltextEq:
12361271
stat, ok, err := c.costFulltext(filter, s, ord)
1237-
return stat, ok, 0, err
1272+
return buckets, stat.FuncDeps(), ok, 0, err
12381273
default:
1239-
conj := newConjCollector(s, ordinals)
1274+
conj := newConjCollector(s, buckets, ordinals)
12401275
conj.add(filter)
1241-
return conj.stat, true, conj.missingPrefix, nil
1276+
return conj.hist, conj.fds, true, conj.missingPrefix, nil
12421277
}
12431278
}
12441279

@@ -1521,9 +1556,11 @@ func newUniformDistStatistic(dbName, tableName string, sch sql.Schema, idx sql.I
15211556
return ret, nil
15221557
}
15231558

1524-
func newConjCollector(s sql.Statistic, ordinals map[string]int) *conjCollector {
1559+
func newConjCollector(s sql.Statistic, hist []sql.HistogramBucket, ordinals map[string]int) *conjCollector {
15251560
return &conjCollector{
15261561
stat: s,
1562+
hist: hist,
1563+
fds: s.FuncDeps(),
15271564
ordinals: ordinals,
15281565
eqVals: make([]interface{}, len(ordinals)),
15291566
nullable: make([]bool, len(ordinals)),
@@ -1534,6 +1571,8 @@ func newConjCollector(s sql.Statistic, ordinals map[string]int) *conjCollector {
15341571
// an index histogram for a list of conjugate filters
15351572
type conjCollector struct {
15361573
stat sql.Statistic
1574+
hist []sql.HistogramBucket
1575+
fds *sql.FuncDepSet
15371576
ordinals map[string]int
15381577
missingPrefix int
15391578
constant sql.FastIntSet
@@ -1587,7 +1626,7 @@ func (c *conjCollector) addEq(col string, val interface{}, nullSafe bool) error
15871626

15881627
// truncate buckets
15891628
var err error
1590-
c.stat, err = stats.PrefixKey(c.stat, c.eqVals[:ord+1], c.nullable)
1629+
c.hist, c.fds, err = stats.PrefixKey(c.stat.Histogram(), c.stat.ColSet(), c.stat.Types(), c.stat.FuncDeps(), c.eqVals[:ord+1], c.nullable)
15911630
if err != nil {
15921631
return err
15931632
}
@@ -1619,19 +1658,19 @@ func (c *conjCollector) cmpFirstCol(op indexScanOp, val interface{}) error {
16191658
switch op {
16201659
case indexScanOpNotEq:
16211660
// todo notEq
1622-
c.stat, err = stats.PrefixGt(c.stat, val)
1661+
c.hist, err = stats.PrefixGt(c.hist, c.stat.Types(), val)
16231662
case indexScanOpGt:
1624-
c.stat, err = stats.PrefixGt(c.stat, val)
1663+
c.hist, err = stats.PrefixGt(c.hist, c.stat.Types(), val)
16251664
case indexScanOpGte:
1626-
c.stat, err = stats.PrefixGte(c.stat, val)
1665+
c.hist, err = stats.PrefixGte(c.hist, c.stat.Types(), val)
16271666
case indexScanOpLt:
1628-
c.stat, err = stats.PrefixLt(c.stat, val)
1667+
c.hist, err = stats.PrefixLt(c.hist, c.stat.Types(), val)
16291668
case indexScanOpLte:
1630-
c.stat, err = stats.PrefixLte(c.stat, val)
1669+
c.hist, err = stats.PrefixLte(c.hist, c.stat.Types(), val)
16311670
case indexScanOpIsNull:
1632-
c.stat, err = stats.PrefixIsNull(c.stat)
1671+
c.hist, err = stats.PrefixIsNull(c.hist)
16331672
case indexScanOpIsNotNull:
1634-
c.stat, err = stats.PrefixIsNotNull(c.stat)
1673+
c.hist, err = stats.PrefixIsNotNull(c.hist)
16351674
}
16361675
return err
16371676
}

0 commit comments

Comments
 (0)