Skip to content

Commit bd454ea

Browse files
committed
Add cli for checking CI test flakiness
1 parent 541e548 commit bd454ea

File tree

6 files changed

+574
-6
lines changed

6 files changed

+574
-6
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
/flakechecker
12
/junit2jira
23
.idea
34
# Binaries for programs and plugins

cmd/flakechecker/bq_client.go

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
package main
2+
3+
import (
4+
"cloud.google.com/go/bigquery"
5+
"context"
6+
"github.com/pkg/errors"
7+
log "github.com/sirupsen/logrus"
8+
"google.golang.org/api/iterator"
9+
"time"
10+
)
11+
12+
// BigQuery settings used by the flake checker.
const (
	// projectID is the project the BigQuery client is created for.
	projectID = "acs-san-stackroxci"

	// queryTimeout is the deadline applied to a single BigQuery read.
	queryTimeout = 1 * time.Minute

	// queryStrGetFailureRatio selects the run count and failure ratio of one
	// test, identified by the @jobName, @filteredName and @classname query
	// parameters. The table name is concatenated from an interpreted string
	// because a raw string literal cannot contain backticks.
	queryStrGetFailureRatio = `
SELECT
JobName,
FilteredName,
Classname,
TotalAll,
FailRatio
FROM
` + "`acs-san-stackroxci.ci_metrics.stackrox_tests_recent_flaky_tests`" + `
WHERE
JobName = @jobName
AND FilteredName = @filteredName
AND Classname = @classname
`
)
28+
29+
type biqQueryClient interface {
30+
GetRatioForTest(flakeTestConfig *flakeDetectionPolicy, testName string) (int, int, error)
31+
}
32+
33+
type biqQueryClientImpl struct {
34+
client *bigquery.Client
35+
}
36+
37+
func getNewBigQueryClient() (biqQueryClient, error) {
38+
ctx := context.Background()
39+
40+
client, err := bigquery.NewClient(ctx, projectID)
41+
if err != nil {
42+
return nil, errors.Wrap(err, "creating BigQuery client")
43+
}
44+
45+
return &biqQueryClientImpl{client: client}, nil
46+
}
47+
48+
func (c *biqQueryClientImpl) GetRatioForTest(flakeTestRec *flakeDetectionPolicy, testName string) (int, int, error) {
49+
query := c.client.Query(queryStrGetFailureRatio)
50+
query.Parameters = []bigquery.QueryParameter{
51+
{Name: "jobName", Value: flakeTestRec.config.RatioJobName},
52+
{Name: "filteredName", Value: testName},
53+
{Name: "classname", Value: flakeTestRec.config.Classname},
54+
}
55+
56+
ctx, cancelBigQueryRequest := context.WithTimeout(context.Background(), queryTimeout)
57+
defer cancelBigQueryRequest()
58+
59+
resIter, err := query.Read(ctx)
60+
if err != nil {
61+
return 0, 0, errors.Wrap(err, "query data from BigQuery")
62+
}
63+
64+
// We need only first flakyTestInfo. No need to loop over iterator.
65+
var flakyTestInfo recentFlakyTestInfo
66+
if errNext := resIter.Next(&flakyTestInfo); errNext != nil {
67+
return 0, 0, errors.Wrap(errNext, "read BigQuery recentFlakyTestInfo")
68+
}
69+
70+
if errNext := resIter.Next(&flakyTestInfo); !errors.Is(errNext, iterator.Done) {
71+
log.Warnf("Expected to find one row in DB, but got more for query params: %v - query: %s", query.Parameters, queryStrGetFailureRatio)
72+
}
73+
74+
return flakyTestInfo.TotalAll, flakyTestInfo.FailRatio, nil
75+
}

cmd/flakechecker/flake_config.go

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
package main
2+
3+
import (
4+
"encoding/json"
5+
"fmt"
6+
"github.com/pkg/errors"
7+
"io"
8+
"os"
9+
"regexp"
10+
)
11+
12+
// flakeDetectionPolicyConfig is one entry of the flakechecker configuration
// and describes how failed tests matching it are evaluated.
type flakeDetectionPolicyConfig struct {
	// MatchJobName selects the jobs that flakechecker should evaluate (e.g.
	// branch jobs should be evaluated, but main should not). It is compiled
	// as an anchored regular expression.
	MatchJobName string `json:"match_job_name"`

	// RatioJobName is the job name used for the ratio calculation, e.g. test
	// runs on the main branch serve as the base for the flake ratio.
	RatioJobName string `json:"ratio_job_name"`

	// TestNameRegex is the regular expression used to match test names. Some
	// test names carry detailed information (e.g. version 4.4.4) while the
	// ratio should apply to the whole group (e.g. 4.4.z); a regex allows
	// tests to be grouped differently.
	TestNameRegex string `json:"test_name_regex"`

	// Classname is the class name of the test to isolate. It allows a single
	// flaky test to be singled out from the rest of its suite.
	Classname string `json:"classname"`

	// RatioThreshold is the failure percentage allowed for this test. The
	// value is usually derived from historical executions collected in DB.
	RatioThreshold int `json:"ratio_threshold"`
}
27+
28+
type flakeDetectionPolicy struct {
29+
config *flakeDetectionPolicyConfig
30+
regexMatchJobName *regexp.Regexp
31+
regexTestNameRegex *regexp.Regexp
32+
}
33+
34+
func newFlakeDetectionPolicy(config *flakeDetectionPolicyConfig) (*flakeDetectionPolicy, error) {
35+
regexMatchJobName, err := regexp.Compile(fmt.Sprintf("^%s$", config.MatchJobName))
36+
if err != nil {
37+
return nil, errors.Wrap(err, fmt.Sprintf("invalid flake config match job regex: %v", config.MatchJobName))
38+
}
39+
40+
regexTestNameRegex, err := regexp.Compile(fmt.Sprintf("^%s$", config.TestNameRegex))
41+
if err != nil {
42+
return nil, errors.Wrap(err, fmt.Sprintf("invalid flake config test name regex: %v", config.TestNameRegex))
43+
}
44+
45+
return &flakeDetectionPolicy{
46+
config: config,
47+
regexMatchJobName: regexMatchJobName,
48+
regexTestNameRegex: regexTestNameRegex,
49+
}, nil
50+
}
51+
52+
// newFlakeDetectionPolicyMust - is primarily used in tests.
53+
func newFlakeDetectionPolicyMust(config *flakeDetectionPolicyConfig) *flakeDetectionPolicy {
54+
policy, err := newFlakeDetectionPolicy(config)
55+
if err != nil {
56+
panic(err)
57+
}
58+
59+
return policy
60+
}
61+
62+
func (r *flakeDetectionPolicy) matchJobName(jobName string) (bool, error) {
63+
return r.regexMatchJobName.MatchString(jobName), nil
64+
}
65+
66+
func (r *flakeDetectionPolicy) matchTestName(testName string) (bool, error) {
67+
return r.regexTestNameRegex.MatchString(testName), nil
68+
}
69+
70+
func (r *flakeDetectionPolicy) matchClassname(classname string) (bool, error) {
71+
return classname == r.config.Classname, nil
72+
}
73+
74+
func loadFlakeConfigFile(fileName string) ([]*flakeDetectionPolicy, error) {
75+
jsonConfigFile, err := os.Open(fileName)
76+
if err != nil {
77+
return nil, errors.Wrap(err, fmt.Sprintf("open flake config file: %s", fileName))
78+
}
79+
defer jsonConfigFile.Close()
80+
81+
jsonConfigFileData, err := io.ReadAll(jsonConfigFile)
82+
if err != nil {
83+
return nil, errors.Wrap(err, fmt.Sprintf("read flake config file: %s", fileName))
84+
}
85+
86+
flakeConfigs := make([]*flakeDetectionPolicyConfig, 0)
87+
err = json.Unmarshal(jsonConfigFileData, &flakeConfigs)
88+
if err != nil {
89+
return nil, errors.Wrap(err, fmt.Sprintf("parse flake config file: %s", fileName))
90+
}
91+
92+
detectionPolicies := make([]*flakeDetectionPolicy, 0, len(flakeConfigs))
93+
for _, flakeConfig := range flakeConfigs {
94+
detectionPolicy, errNewPolicy := newFlakeDetectionPolicy(flakeConfig)
95+
if errNewPolicy != nil {
96+
return nil, errors.Wrap(err, fmt.Sprintf("create flake detection policy from config: %v", flakeConfig))
97+
}
98+
99+
detectionPolicies = append(detectionPolicies, detectionPolicy)
100+
}
101+
102+
return detectionPolicies, nil
103+
}

cmd/flakechecker/main.go

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
package main
2+
3+
import (
4+
_ "embed"
5+
"flag"
6+
"fmt"
7+
"github.com/carlmjohnson/versioninfo"
8+
junit "github.com/joshdk/go-junit"
9+
"github.com/pkg/errors"
10+
log "github.com/sirupsen/logrus"
11+
"github.com/stackrox/junit2jira/pkg/testcase"
12+
"os"
13+
)
14+
15+
// totalRunsLimit is the minimum number of recorded runs a test needs before
// its failure ratio is evaluated.
const totalRunsLimit = 30

// Descriptions attached to the errors returned while checking failed tests.
const (
	errDescNoMatch        = "there is no match in allowed flake tests"
	errDescAboveThreshold = "allowed flake ratio for test is above threshold"
	errDescShortHistory   = "total runs for test is under history count threshold"
	errDescGetRatio       = "get ratio for test failed"
)
21+
22+
// flakeCheckerParams holds the command-line parameters of flakechecker.
type flakeCheckerParams struct {
	// junitReportsDir is the directory containing the jUnit XML reports and
	// configFile is the config file with the defined failure ratios.
	junitReportsDir, configFile string

	// jobName is the name of the CI job and orchestrator the orchestrator
	// name (such as GKE or OpenShift), if any.
	jobName, orchestrator string

	// dryRun is set by the -dry-run flag.
	dryRun bool
}
31+
32+
func main() {
33+
var debug bool
34+
var err error
35+
36+
p := flakeCheckerParams{}
37+
flag.StringVar(&p.junitReportsDir, "junit-reports-dir", os.Getenv("ARTIFACT_DIR"), "Dir that contains jUnit reports XML files")
38+
flag.StringVar(&p.configFile, "config-file", "", "Config file with defined failure ratios")
39+
40+
flag.StringVar(&p.jobName, "job-name", "", "Name of CI job.")
41+
flag.StringVar(&p.orchestrator, "orchestrator", "", "orchestrator name (such as GKE or OpenShift), if any.")
42+
43+
flag.BoolVar(&p.dryRun, "dry-run", false, "When set to true issues will NOT be created.")
44+
flag.BoolVar(&debug, "debug", false, "Enable debug log level")
45+
versioninfo.AddFlag(flag.CommandLine)
46+
flag.Parse()
47+
48+
if debug {
49+
log.SetLevel(log.DebugLevel)
50+
}
51+
52+
err = p.run()
53+
if err != nil {
54+
log.Fatal(err)
55+
}
56+
}
57+
58+
// recentFlakyTestInfo receives one row of the failure-ratio query; its fields
// carry the same names as the columns selected by that query.
type recentFlakyTestInfo struct {
	// Identification of the test the row belongs to.
	JobName, FilteredName, Classname string

	// TotalAll and FailRatio are the values returned by GetRatioForTest as
	// the total run count and the failure ratio.
	TotalAll, FailRatio int
}
65+
66+
func (p *flakeCheckerParams) checkFailedTests(bqClient biqQueryClient, failedTests []testcase.TestCase, flakeCheckerRecs []*flakeDetectionPolicy) error {
67+
for _, failedTest := range failedTests {
68+
found := false
69+
log.Infof("Checking failed test: %q / %q / %q", p.jobName, failedTest.Name, failedTest.Classname)
70+
for _, flakeCheckerRec := range flakeCheckerRecs {
71+
match, err := flakeCheckerRec.matchJobName(p.jobName)
72+
if err != nil {
73+
return err
74+
}
75+
76+
if !match {
77+
continue
78+
}
79+
80+
match, err = flakeCheckerRec.matchTestName(failedTest.Name)
81+
if err != nil {
82+
return err
83+
}
84+
85+
if !match {
86+
continue
87+
}
88+
89+
match, err = flakeCheckerRec.matchClassname(failedTest.Classname)
90+
if err != nil {
91+
return err
92+
}
93+
94+
if !match {
95+
continue
96+
}
97+
98+
found = true
99+
log.Infof("Match found: %q / %q / %q", flakeCheckerRec.config.MatchJobName, flakeCheckerRec.config.TestNameRegex, flakeCheckerRec.config.Classname)
100+
totalRuns, failRatio, err := bqClient.GetRatioForTest(flakeCheckerRec, failedTest.Name)
101+
if err != nil {
102+
return errors.Wrap(err, errDescGetRatio)
103+
}
104+
105+
if totalRuns < totalRunsLimit {
106+
return errors.Wrap(fmt.Errorf("%d", totalRuns), errDescShortHistory)
107+
}
108+
109+
if failRatio > flakeCheckerRec.config.RatioThreshold {
110+
return errors.Wrap(fmt.Errorf("(%d > %d)", failRatio, flakeCheckerRec.config.RatioThreshold), errDescAboveThreshold)
111+
}
112+
113+
log.Infof("Ratio is below threshold: (%d <= %d)", failRatio, flakeCheckerRec.config.RatioThreshold)
114+
}
115+
116+
if !found {
117+
return errors.Wrap(errors.New(failedTest.Name), errDescNoMatch)
118+
}
119+
}
120+
121+
return nil
122+
}
123+
124+
func (p *flakeCheckerParams) run() error {
125+
testSuites, err := junit.IngestDir(p.junitReportsDir)
126+
if err != nil {
127+
log.Fatalf("could not read files: %s", err)
128+
}
129+
130+
failedTests, err := testcase.GetFailedTests(testSuites)
131+
if err != nil {
132+
return errors.Wrap(err, "could not find failed tests")
133+
}
134+
135+
if len(failedTests) == 0 {
136+
log.Info("No failed tests to process")
137+
return nil
138+
}
139+
140+
log.Infof("Found %d failed tests", len(failedTests))
141+
142+
flakeConfigs, err := loadFlakeConfigFile(p.configFile)
143+
if err != nil {
144+
log.Fatalf("unable to load config file (%s): %s", p.configFile, err)
145+
}
146+
147+
bqClient, err := getNewBigQueryClient()
148+
if err != nil {
149+
log.Fatalf("unable to create BigQuery client: %s", err)
150+
}
151+
152+
if err = p.checkFailedTests(bqClient, failedTests, flakeConfigs); err != nil {
153+
log.Fatal(err)
154+
}
155+
156+
log.Info("All failed tests are within allowed flake thresholds")
157+
158+
return nil
159+
}

0 commit comments

Comments
 (0)