stackrox
diff --git a/‎.github/workflows/go.yml‎
Lines changed: 7 additions & 2 deletions b/‎.github/workflows/go.yml‎
Lines changed: 7 additions & 2 deletions
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 0 deletions b/‎.gitignore‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎README.md‎
Lines changed: 54 additions & 2 deletions b/‎README.md‎
Lines changed: 54 additions & 2 deletions
diff --git a/‎cmd/flakechecker/bq_client.go‎
Lines changed: 76 additions & 0 deletions b/‎cmd/flakechecker/bq_client.go‎
Lines changed: 76 additions & 0 deletions
diff --git a/‎cmd/flakechecker/flake_config.go‎
Lines changed: 109 additions & 0 deletions b/‎cmd/flakechecker/flake_config.go‎
Lines changed: 109 additions & 0 deletions
diff --git a/‎cmd/flakechecker/flake_config_test.go‎
Lines changed: 68 additions & 0 deletions b/‎cmd/flakechecker/flake_config_test.go‎
Lines changed: 68 additions & 0 deletions
@@ -29,7 +29,9 @@ jobs:
     - name: Compress binaries
       uses: svenstaro/upx-action@v2
       with:
-        file: junit2jira
+        files: |
+          junit2jira
+          flakechecker
 
     - name: Test
       run: go test -v ./...
@@ -38,7 +40,9 @@ jobs:
       uses: actions/upload-artifact@v3
       with:
         name: junit2jira
-        path: junit2jira
+        path: |
+          junit2jira
+          flakechecker
 
   release:
     if: startsWith(github.ref, 'refs/tags/')
@@ -55,3 +59,4 @@ jobs:
         with:
           files: |
             junit2jira
+            flakechecker
@@ -1,3 +1,4 @@
+/flakechecker
 /junit2jira
 .idea
 # Binaries for programs and plugins
 
@@ -1,6 +1,6 @@
 # junit2jira
 
-Convert test failures to jira issues
+Utility tools for handling test failures
 
 ### Build
 ```shell
@@ -14,6 +14,16 @@ go test ./...
 
 ### Usage
 
+This repo provides two cli tools:
+- junit2jira
+- flakechecker
+
+### junit2jira
+
+`junit2jira` converts test failures into Jira issues. It also posts Slack messages for new failures and imports test results into a database.
+
+*Usage*
+
 ```shell
 Usage of junit2jira:
   -base-link string
@@ -51,7 +61,7 @@ Usage of junit2jira:
     	print version information and exit
 ```
 
-## Example usage
+*Example usage*
 ```shell
 JIRA_TOKEN="..." junit2jira \
   -jira-url "https://..." \
@@ -65,3 +75,45 @@ JIRA_TOKEN="..." junit2jira \
   -timestamp "$(date --rfc-3339=seconds)" \
   -csv-output -
 ```
+
+### flakechecker
+
+`flakechecker` helps prevent unnecessary CI pipeline failures by suppressing known flaky tests that are within the allowed failure thresholds.
+
+`flakechecker` relies on several components:
+- collected test results from `junit2jira`: we generate a table of flaky tests, including their failure ratios for the last 30 executions.
+- flaky test configuration: we define and provide a `flakechecker` configuration with allowed failure ratio thresholds for known flaky tests.
+- CI pipeline integration script: `flakechecker` is executed as the last step in a CI pipeline, and its result allows the CI pipeline script to report success or failure.
+
+`flakechecker` expects at least one failed test. It returns an error if it is executed on test results without any failures.
+
+`flakechecker` decision making:
+- it checks if a failed test in a CI pipeline is listed as flaky in the provided configuration.
+- if the test is not found in the flaky tests config -> it will cause the CI pipeline to fail. (test not found)
+- if the test is found in the configuration, `flakechecker` will fetch information about the fail ratio for that test from the database. If we have fewer than 30 executions for that test -> it will cause the CI pipeline to fail. (insufficient historical test results)
+- if the test's failure ratio in the database exceeds the threshold defined in the config -> it will cause the CI pipeline to fail. (flake ratio is above the allowed threshold)
+- if a flaky test's failure ratio is below the defined threshold -> it will report the test as a success in the CI pipeline. (test suppression)
+
+The `flakechecker` will apply this logic for each failed test in the CI pipeline.
+
+*Usage*
+
+```
+Usage of flakechecker:
+  -config-file string
+        Config file with allowed flakes.
+  -debug
+        Enable debug log level.
+  -job-name string
+        Name of CI job.
+  -junit-reports-dir string
+        Directory containing JUnit report XML files.
+  -v    short alias for -version
+  -version
+        print version information and exit
+```
+
+*Example usage*
+```
+flakechecker -config-file flake-config.yml -job-name "${JOB_NAME}" -junit-reports-dir "${ARTIFACT_DIR}"
+```
@@ -0,0 +1,76 @@
+package main
+
+import (
+	"cloud.google.com/go/bigquery"
+	"context"
+	"github.com/pkg/errors"
+	log "github.com/sirupsen/logrus"
+	"time"
+)
+
+// projectID is the project identifier passed to bigquery.NewClient.
+const projectID = "acs-san-stackroxci"
+
+// queryTimeout bounds a single BigQuery read issued by GetRatioForTest.
+const queryTimeout = 1 * time.Minute
+
+// queryStrGetFailureRatio selects the TotalAll and FailRatio columns for the row(s) matching
+// the @jobName, @className and @testName query parameters.
+const queryStrGetFailureRatio = `
+SELECT
+    TotalAll,
+    FailRatio
+FROM
+` + "`acs-san-stackroxci.ci_metrics.stackrox_tests__recent_flaky_tests`" + `
+WHERE
+    JobName = @jobName
+    AND Classname = @className
+    AND Name = @testName
+`
+
+// recentFlakyTestInfo is the destination struct for one result row of queryStrGetFailureRatio.
+type recentFlakyTestInfo struct {
+	// TotalAll receives the TotalAll column.
+	// NOTE(review): presumably the number of recorded executions of the test - confirm against the table definition.
+	TotalAll  int
+	// FailRatio receives the FailRatio column.
+	// NOTE(review): presumably a percentage comparable to RatioThreshold - confirm against the table definition.
+	FailRatio int
+}
+
+// biqQueryClient abstracts the lookup of recorded failure data for a single test.
+// NOTE(review): the name looks like a typo of "bigQuery"; the correctly spelled bigQueryClient
+// is already used by the concrete type, so a rename needs to touch every user of this interface.
+type biqQueryClient interface {
+	// GetRatioForTest returns two ints and an error for the given policy config and test name.
+	// The bigQueryClient implementation returns the TotalAll and FailRatio values, in that order.
+	GetRatioForTest(config flakeDetectionPolicyConfig, testName string) (int, int, error)
+}
+
+// bigQueryClient wraps a *bigquery.Client; its GetRatioForTest method runs queries through it.
+type bigQueryClient struct {
+	// client is the underlying BigQuery client used to issue queries.
+	client *bigquery.Client
+}
+
+// getNewBigQueryClient creates a BigQuery client for projectID and returns it wrapped
+// in a bigQueryClient. A client creation failure is returned wrapped with context.
+func getNewBigQueryClient() (biqQueryClient, error) {
+	bqClient, err := bigquery.NewClient(context.Background(), projectID)
+	if err != nil {
+		return nil, errors.Wrap(err, "creating BigQuery client")
+	}
+
+	wrapped := &bigQueryClient{client: bqClient}
+	return wrapped, nil
+}
+
+// GetRatioForTest runs queryStrGetFailureRatio with config.RatioJobName, config.ClassName and
+// testName as query parameters and returns the TotalAll and FailRatio values of the first
+// result row, in that order. The read is bounded by queryTimeout. Any failure to run the query
+// or to read the first row is returned as a wrapped error together with zero values.
+func (c *bigQueryClient) GetRatioForTest(config flakeDetectionPolicyConfig, testName string) (int, int, error) {
+	q := c.client.Query(queryStrGetFailureRatio)
+	q.Parameters = []bigquery.QueryParameter{
+		{Name: "jobName", Value: config.RatioJobName},
+		{Name: "className", Value: config.ClassName},
+		{Name: "testName", Value: testName},
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), queryTimeout)
+	defer cancel()
+
+	rows, err := q.Read(ctx)
+	if err != nil {
+		return 0, 0, errors.Wrap(err, "query data from BigQuery")
+	}
+
+	// Only the first row is consumed, so the iterator is advanced exactly once.
+	var row recentFlakyTestInfo
+	if err := rows.Next(&row); err != nil {
+		return 0, 0, errors.Wrapf(err, "read BigQuery result for flaky test for query params: %v - query: %s", q.Parameters, queryStrGetFailureRatio)
+	}
+
+	// Additional rows are unexpected but not fatal: warn and still use the first row.
+	if rows.TotalRows > 1 {
+		log.Warnf("Expected to find one row in DB, but got more for query params: %v - query: %s", q.Parameters, queryStrGetFailureRatio)
+	}
+
+	return row.TotalAll, row.FailRatio, nil
+}
@@ -0,0 +1,109 @@
+package main
+
+import (
+	"fmt"
+	"github.com/pkg/errors"
+	"gopkg.in/yaml.v3"
+	"io"
+	"os"
+	"regexp"
+)
+
+// flakeDetectionPolicyConfig represents configuration used by flakechecker to evaluate failed tests.
+type flakeDetectionPolicyConfig struct {
+	// JobNameRegex is a regular expression for the name of the CI job that should be evaluated by flakechecker.
+	// (i.e. CI jobs for PRs should be evaluated, but not CI jobs for commits already merged to "main" branch)
+	JobNameRegex string `yaml:"jobNameRegex"`
+	// ClassName is class name of the test that should be isolated. Usually class name for Groovy tests,
+	// package name for golang tests, etc.
+	ClassName string `yaml:"className"`
+	// TestNameRegex is a regular expression used to match test names. Some test names contain detailed information
+	// (i.e. version 4.4.4), but we want to use ratio for all tests in that group (i.e. 4.4.z).
+	// Using a regex allows us to group tests as needed.
+	TestNameRegex string `yaml:"testNameRegex"`
+	// RatioJobName is CI job name that should be used for ratio calculation.
+	// i.e. we take CI runs for commits on "main" branch as input for evaluation of flake ratio.
+	RatioJobName string `yaml:"ratioJobName"`
+	// RatioThreshold is the maximum failure percentage that is used to distinguish a flaky test from
+	// a completely broken test. This information is usually fetched from historical executions and data
+	// collected in DB. If measured flakiness exceeds this threshold, we no longer want to suppress test failure,
+	// because we suspect it might have regressed above what we consider acceptable.
+	RatioThreshold int `yaml:"ratioThreshold"`
+}
+
+// flakeDetectionPolicy pairs a flakeDetectionPolicyConfig with the compiled forms of its
+// job name and test name regular expressions.
+type flakeDetectionPolicy struct {
+	// config is the configuration this policy was built from.
+	config                flakeDetectionPolicyConfig
+	// compiledJobNameRegex is compiled from config.JobNameRegex.
+	compiledJobNameRegex  *regexp.Regexp
+	// compiledTestNameRegex is compiled from config.TestNameRegex.
+	compiledTestNameRegex *regexp.Regexp
+}
+
+// newFlakeDetectionPolicy compiles the job name and test name regular expressions of config
+// and returns a policy holding the config together with both compiled expressions.
+//
+// Each expression is anchored so that it has to match the whole name. The pattern is wrapped
+// in a non-capturing group before anchoring: without the group a top-level alternation such as
+// "a|b" would become "^a|b$", which matches any name starting with "a" or ending with "b".
+//
+// An error is returned if either expression fails to compile.
+func newFlakeDetectionPolicy(config flakeDetectionPolicyConfig) (*flakeDetectionPolicy, error) {
+	compiledJobNameRegex, err := regexp.Compile(fmt.Sprintf("^(?:%s)$", config.JobNameRegex))
+	if err != nil {
+		return nil, errors.Wrapf(err, "invalid flake config match job regex: %v", config.JobNameRegex)
+	}
+
+	compiledTestNameRegex, err := regexp.Compile(fmt.Sprintf("^(?:%s)$", config.TestNameRegex))
+	if err != nil {
+		return nil, errors.Wrapf(err, "invalid flake config test name regex: %v", config.TestNameRegex)
+	}
+
+	return &flakeDetectionPolicy{
+		config:                config,
+		compiledJobNameRegex:  compiledJobNameRegex,
+		compiledTestNameRegex: compiledTestNameRegex,
+	}, nil
+}
+
+// matchJobName reports whether jobName matches the policy's compiled job name regex.
+func (r *flakeDetectionPolicy) matchJobName(jobName string) bool {
+	jobNameRegex := r.compiledJobNameRegex
+	return jobNameRegex.MatchString(jobName)
+}
+
+// matchClassName reports whether classname is exactly equal to the configured ClassName.
+func (r *flakeDetectionPolicy) matchClassName(classname string) bool {
+	wantClassName := r.config.ClassName
+	return wantClassName == classname
+}
+
+// matchTestName reports whether testName matches the policy's compiled test name regex.
+func (r *flakeDetectionPolicy) matchTestName(testName string) bool {
+	testNameRegex := r.compiledTestNameRegex
+	return testNameRegex.MatchString(testName)
+}
+
+// findFlakeConfigForTest returns the first policy in flakeCheckerRecs whose job name, class name
+// and test name checks all accept the given values. If no policy matches, it returns an error
+// that carries the three quoted values and is wrapped with errDescNoMatch.
+func findFlakeConfigForTest(flakeCheckerRecs []*flakeDetectionPolicy, jobName string, className string, testName string) (*flakeDetectionPolicy, error) {
+	for _, policy := range flakeCheckerRecs {
+		// The checks run in the order job name, class name, test name; the first failing check skips the policy.
+		if !policy.matchJobName(jobName) {
+			continue
+		}
+		if !policy.matchClassName(className) {
+			continue
+		}
+		if !policy.matchTestName(testName) {
+			continue
+		}
+
+		return policy, nil
+	}
+
+	lookupKey := errors.Errorf("%q / %q / %q", jobName, className, testName)
+	return nil, errors.Wrap(lookupKey, errDescNoMatch)
+}
+
+// loadFlakeConfigFile reads the YAML file fileName, parses it as a list of
+// flakeDetectionPolicyConfig entries and builds one flakeDetectionPolicy per entry,
+// preserving the order of the file.
+//
+// It returns a wrapped error if the file cannot be opened, read or parsed, or if any
+// entry cannot be turned into a policy (for example because of an invalid regex).
+func loadFlakeConfigFile(fileName string) ([]*flakeDetectionPolicy, error) {
+	ymlConfigFile, err := os.Open(fileName)
+	if err != nil {
+		return nil, errors.Wrapf(err, "open flake config file: %s", fileName)
+	}
+	defer ymlConfigFile.Close()
+
+	ymlConfigFileData, err := io.ReadAll(ymlConfigFile)
+	if err != nil {
+		return nil, errors.Wrapf(err, "read flake config file: %s", fileName)
+	}
+
+	flakeConfigs := make([]flakeDetectionPolicyConfig, 0)
+	if err := yaml.Unmarshal(ymlConfigFileData, &flakeConfigs); err != nil {
+		return nil, errors.Wrapf(err, "parse flake config file: %s", fileName)
+	}
+
+	detectionPolicies := make([]*flakeDetectionPolicy, 0, len(flakeConfigs))
+	for _, flakeConfig := range flakeConfigs {
+		detectionPolicy, errNewPolicy := newFlakeDetectionPolicy(flakeConfig)
+		if errNewPolicy != nil {
+			// Wrap the policy error itself: wrapping a nil error yields nil, which would
+			// report success with a nil result for an invalid configuration entry.
+			return nil, errors.Wrapf(errNewPolicy, "create flake detection policy from config: %v", flakeConfig)
+		}
+
+		detectionPolicies = append(detectionPolicies, detectionPolicy)
+	}
+
+	return detectionPolicies, nil
+}
@@ -0,0 +1,68 @@
+package main
+
+import (
+	"github.com/stretchr/testify/assert"
+	"testing"
+)
+
+// newFlakeDetectionPolicyMust is a test helper that builds a policy from config and
+// panics if newFlakeDetectionPolicy reports an error.
+func newFlakeDetectionPolicyMust(config flakeDetectionPolicyConfig) *flakeDetectionPolicy {
+	built, buildErr := newFlakeDetectionPolicy(config)
+	if buildErr == nil {
+		return built
+	}
+
+	panic(buildErr)
+}
+
+// TestLoadFlakeConfigFile checks loadFlakeConfigFile against a missing file and against
+// the valid sample configuration in testdata/flake-config.yml.
+func TestLoadFlakeConfigFile(t *testing.T) {
+	testCases := []struct {
+		name     string
+		fileName string
+
+		expectError    bool
+		expectErrorStr string
+		expectConfig   []*flakeDetectionPolicy
+	}{
+		{
+			name:           "no config file",
+			fileName:       "no_config.yml",
+			expectError:    true,
+			expectErrorStr: "open flake config file: no_config.yml: open no_config.yml: no such file or directory",
+			expectConfig:   nil,
+		},
+		{
+			name:        "valid config file",
+			fileName:    "testdata/flake-config.yml",
+			expectError: false,
+			expectConfig: []*flakeDetectionPolicy{
+				newFlakeDetectionPolicyMust(flakeDetectionPolicyConfig{
+					JobNameRegex:   "pr-.*",
+					ClassName:      "TestLoadFlakeConfigFile",
+					TestNameRegex:  "TestLoadFlakeConf.*",
+					RatioJobName:   "main-branch-tests",
+					RatioThreshold: 5,
+				}),
+				newFlakeDetectionPolicyMust(flakeDetectionPolicyConfig{
+					JobNameRegex:   "pull-request-tests",
+					ClassName:      "TestLoadFlakeConfigFile",
+					TestNameRegex:  "TestLoadFlakeConfigFile",
+					RatioJobName:   "main-branch-tests",
+					RatioThreshold: 10,
+				}),
+			},
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(subT *testing.T) {
+			gotConfig, gotErr := loadFlakeConfigFile(tc.fileName)
+
+			// The error is checked first, then the returned policies, for every case.
+			if !tc.expectError {
+				assert.NoError(subT, gotErr)
+			} else {
+				assert.EqualError(subT, gotErr, tc.expectErrorStr)
+			}
+			assert.Equal(subT, tc.expectConfig, gotConfig)
+		})
+	}
+}
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,4 @@`
	`1`	`+/flakechecker`
`1`	`2`	`/junit2jira`
`2`	`3`	`.idea`
`3`	`4`	`# Binaries for programs and plugins`