diff --git a/config/config-matterwick.default.json b/config/config-matterwick.default.json index 08816fc..322c2f7 100644 --- a/config/config-matterwick.default.json +++ b/config/config-matterwick.default.json @@ -78,10 +78,11 @@ "E2EUsername": "admin", "E2EPassword": "", "E2EServerVersion": "latest", - "E2EAutoTriggerOnRelease": true, "E2EAutoTriggerOnMaster": true, "E2EReleasePatternPrefix": "release-", - "E2ENightlyTriggerWorkflowName": "E2E Nightly Trigger", "E2ETestWorkflowNames": ["Electron Playwright Tests", "E2E", "Compatibility Matrix Testing"], - "E2EInstanceMaxAge": 6 + "E2EInstanceMaxAge": 6, + "E2EPRInstanceMaxAge": 24, + "CMTTriggerWorkflowName": "CMT Provisioner", + "CMTServerVersions": ["10.11.18", "11.7.1"] } diff --git a/server/config.go b/server/config.go index 88b894d..0767228 100644 --- a/server/config.go +++ b/server/config.go @@ -112,19 +112,42 @@ type MatterwickConfig struct { E2EUsername string E2EPassword string E2EServerVersion string - E2EAutoTriggerOnRelease bool - E2EAutoTriggerOnMaster bool - E2EReleasePatternPrefix string - E2ENightlyTriggerWorkflowName string // workflow name (name: field) of the nightly trigger workflow - E2ETestWorkflowNames []string // workflow names of the actual test workflows (for completion-based cleanup) + E2EAutoTriggerOnMaster bool + E2EReleasePatternPrefix string + E2ETestWorkflowNames []string // workflow names of the actual test workflows (for completion-based cleanup) // E2EInstanceMaxAge is the minimum age (in hours) a non-PR E2E instance must reach // before the periodic orphan-cleanup scan will delete it. This prevents the scan // from destroying instances that are still being used by a currently-running test. // Set to the longest expected E2E run duration plus a small buffer. // Default (0): 3 hours. E2EInstanceMaxAge int + + // E2EPRInstanceMaxAge is the maximum age (in hours) a PR E2E instance may reach before + // the periodic cleanup scan deletes it. PR instances are intentionally kept alive between + // label toggles and across commits so the same servers can be reused for re-runs, so this + // is much longer than E2EInstanceMaxAge. When such an instance is reaped its in-memory + // tracking entry is also evicted, so re-applying E2E/Run provisions a fresh set. + // Default (0): 24 hours. + E2EPRInstanceMaxAge int + + // CMTTriggerWorkflowName is the workflow name (the "name:" field) of the lightweight, + // scheduled CMT trigger workflow in the desktop/mobile repos. When matterwick receives + // a workflow_run "requested" event for this workflow it provisions one instance per + // version in CMTServerVersions and dispatches compatibility-matrix-testing.yml. + CMTTriggerWorkflowName string + + // CMTServerVersions is an OPTIONAL manual override for the CMT version set. When non-empty + // it is used verbatim (values must be valid Mattermost image tags: full semver, no "v" + // prefix, e.g. "10.11.0"). When empty (the normal case) matterwick auto-derives the set + // from Mattermost's GitHub releases via Server.resolveCMTServerVersions. + CMTServerVersions []string } +// defaultCMTServerVersions is the fallback CMT version set used only when auto-resolution +// fails (GitHub API error) and no manual override is configured. Kept reasonably current: +// the active v10.11 ESR plus a recent v11 release. +var defaultCMTServerVersions = []string{"10.11.18", "11.7.1"} + func findConfigFile(fileName string) string { if _, err := os.Stat("/tmp/" + fileName); err == nil { fileName, _ = filepath.Abs("/tmp/" + fileName) diff --git a/server/e2e_dryrun_test.go b/server/e2e_dryrun_test.go index d778257..37768ab 100644 --- a/server/e2e_dryrun_test.go +++ b/server/e2e_dryrun_test.go @@ -26,6 +26,7 @@ import ( "strings" "sync" "testing" + "time" gogithub "github.com/google/go-github/v32/github" "github.com/mattermost/matterwick/model" @@ -532,13 +533,14 @@ func TestDryRun_DesktopCMT(t *testing.T) { assert.Equal(t, []string{"v11.1.0", "v11.2.0", "v12.0.0"}, versions) }) - t.Run("caps server versions to 5", func(t *testing.T) { - versions := parseServerVersionsFromString("v1, v2, v3, v4, v5, v6, v7") - // The cap is enforced inside handleCMTWithServerVersions - if len(versions) > 5 { - versions = versions[:5] + t.Run("caps server versions to 10", func(t *testing.T) { + versions := parseServerVersionsFromString("v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12") + // The cap is enforced inside handleCMTWithServerVersions (maxVersions = 10). + const maxVersions = 10 + if len(versions) > maxVersions { + versions = versions[:maxVersions] } - assert.Len(t, versions, 5) + assert.Len(t, versions, maxVersions) }) t.Run("1 instance per version for CMT (matrix handles parallelism)", func(t *testing.T) { @@ -570,9 +572,15 @@ func TestDryRun_DesktopCMT(t *testing.T) { s0 := servers[0].(map[string]interface{}) assert.Equal(t, "v11.1.0", s0["version"]) assert.Equal(t, "https://v1.example.com", s0["url"]) + // Desktop ignores the `latest` field; cmtServer.Latest is shared with mobile but is + // never set on the desktop path, and `omitempty` keeps it out of the JSON entirely. + _, has0 := s0["latest"] + assert.False(t, has0, "desktop matrix must not carry the `latest` field") s1 := servers[1].(map[string]interface{}) assert.Equal(t, "v11.2.0", s1["version"]) assert.Equal(t, "https://v2.example.com", s1["url"]) + _, has1 := s1["latest"] + assert.False(t, has1, "desktop matrix must not carry the `latest` field") }) t.Run("CMT dispatches compatibility-matrix-testing.yml once", func(t *testing.T) { @@ -629,9 +637,16 @@ func TestDryRun_MobileCMT(t *testing.T) { s0 := servers[0].(map[string]interface{}) assert.Equal(t, "v11.1.0", s0["version"]) assert.Equal(t, "https://v1.example.com", s0["url"]) + // Older version: `latest` is omitted entirely (cmtServer.Latest is false, omitempty). + _, has0 := s0["latest"] + assert.False(t, has0, "older mobile entries must not carry the `latest` field") s1 := servers[1].(map[string]interface{}) assert.Equal(t, "v11.2.0", s1["version"]) assert.Equal(t, "https://v2.example.com", s1["url"]) + // Highest semver gets `latest: true`. The mobile workflow uses this to decide whether + // to run the whole suite (latest) or just smoke (older) — that policy lives there, + // not in matterwick. + assert.Equal(t, true, s1["latest"]) }) t.Run("mobile CMT dispatches once not once per version", func(t *testing.T) { @@ -668,6 +683,86 @@ func TestDryRun_MobileCMT(t *testing.T) { } assert.Equal(t, len(versions), len(instances), "one instance per version") }) + + t.Run("mobile CMT marks the highest-semver entry as latest", func(t *testing.T) { + // 5-element resolved set (today's typical shape): ESR + 3 minors + current RC. + // Across the boundary cases that matter: ESR is older despite high patch numbers; + // RC vs stable for the same X.Y.Z should treat stable as higher; multi-digit RC + // numbers (rc.10 > rc.2). Locking these in so the workflow's latest gate doesn't + // silently shift if someone tweaks the comparator. + cases := []struct { + name string + versions []string + wantLatestIdx int + }{ + { + name: "ESR + 3 minors + RC: RC's base is newest so RC is latest", + versions: []string{"10.11.19", "11.5.7", "11.6.4", "11.7.2", "11.8.0-rc3"}, + wantLatestIdx: 4, + }, + { + name: "ESR with high patch loses to lower-patch newer minor", + versions: []string{"10.11.19", "11.0.0"}, + wantLatestIdx: 1, + }, + { + name: "stable beats same-X.Y.Z RC", + versions: []string{"11.7.0-rc3", "11.7.0"}, + wantLatestIdx: 1, + }, + { + name: "rc.10 > rc.2 (no string compare)", + versions: []string{"11.8.0-rc2", "11.8.0-rc10"}, + wantLatestIdx: 1, + }, + { + name: "single version is latest", + versions: []string{"11.7.2"}, + wantLatestIdx: 0, + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + instances := make([]*E2EInstance, len(tc.versions)) + for i := range tc.versions { + instances[i] = &E2EInstance{URL: fmt.Sprintf("https://v%d.example.com", i)} + } + jsonStr, err := buildMobileCMTMatrixJSON(tc.versions, instances) + require.NoError(t, err) + var matrix map[string]interface{} + require.NoError(t, json.Unmarshal([]byte(jsonStr), &matrix)) + servers := matrix["server"].([]interface{}) + require.Len(t, servers, len(tc.versions)) + for i, raw := range servers { + s := raw.(map[string]interface{}) + _, has := s["latest"] + if i == tc.wantLatestIdx { + assert.Equal(t, true, s["latest"], "index %d (%q) should be latest", i, tc.versions[i]) + } else { + assert.False(t, has, "index %d (%q) must not carry latest", i, tc.versions[i]) + } + } + }) + } + }) + + t.Run("mobile CMT: all unparseable versions => no latest marker", func(t *testing.T) { + // Defensive: if the resolved set somehow contains no parseable versions, leave the + // matrix unmarked. The workflow falls back to its default (smoke for all). + versions := []string{"junk", "also-junk"} + instances := []*E2EInstance{ + {URL: "https://a.example.com"}, + {URL: "https://b.example.com"}, + } + jsonStr, err := buildMobileCMTMatrixJSON(versions, instances) + require.NoError(t, err) + var matrix map[string]interface{} + require.NoError(t, json.Unmarshal([]byte(jsonStr), &matrix)) + for _, raw := range matrix["server"].([]interface{}) { + _, has := raw.(map[string]interface{})["latest"] + assert.False(t, has, "no entry should be latest when all versions are unparseable") + } + }) } // ------------------------------------------------------------ @@ -809,6 +904,39 @@ func TestDryRun_InstanceTracking(t *testing.T) { }) } +// ------------------------------------------------------------ +// 9b. SHA-scoped cleanup must not reap a concurrent flow on the same SHA +// ------------------------------------------------------------ + +func TestInstanceKeyMatchesSHA(t *testing.T) { + repo := "mattermost-mobile" + sha := "deadbeef" + cmtKey := fmt.Sprintf("%s-cmt-100-%s", repo, sha) // CMT flow + nightlyKey := fmt.Sprintf("%s-scheduled-200-%s", repo, sha) // nightly flow, same SHA + pushKey := fmt.Sprintf("%s-push-release-9.0-%s", repo, sha) // push flow, same SHA + prKey := fmt.Sprintf("%s-pr-42", repo) // PR flow, no -sha suffix + otherSHAKey := fmt.Sprintf("%s-cmt-100-%s", repo, "feedface") // CMT, different SHA + + t.Run("CMT completion matches only the CMT key", func(t *testing.T) { + assert.True(t, instanceKeyMatchesSHA(cmtKey, repo, sha, true)) + assert.False(t, instanceKeyMatchesSHA(nightlyKey, repo, sha, true), "nightly key must survive a CMT completion") + assert.False(t, instanceKeyMatchesSHA(pushKey, repo, sha, true)) + assert.False(t, instanceKeyMatchesSHA(prKey, repo, sha, true)) + assert.False(t, instanceKeyMatchesSHA(otherSHAKey, repo, sha, true), "different SHA must not match") + }) + + t.Run("non-CMT completion matches push/scheduled but not CMT", func(t *testing.T) { + assert.False(t, instanceKeyMatchesSHA(cmtKey, repo, sha, false), "CMT key must survive a nightly/push completion") + assert.True(t, instanceKeyMatchesSHA(nightlyKey, repo, sha, false)) + assert.True(t, instanceKeyMatchesSHA(pushKey, repo, sha, false)) + assert.False(t, instanceKeyMatchesSHA(prKey, repo, sha, false), "PR keys have no -sha suffix") + }) + + t.Run("other repo is never matched", func(t *testing.T) { + assert.False(t, instanceKeyMatchesSHA("mattermost-desktop-cmt-100-"+sha, repo, sha, true)) + }) +} + // ------------------------------------------------------------ // 10. Instance name length safety // ------------------------------------------------------------ @@ -1374,13 +1502,295 @@ func TestDryRun_CMTVersionNormalization(t *testing.T) { assert.Equal(t, "11.1.0", s1["version"]) }) - t.Run("CMT versions capped at 5", func(t *testing.T) { - input := "v1.0.0, v2.0.0, v3.0.0, v4.0.0, v5.0.0, v6.0.0, v7.0.0" + t.Run("CMT versions capped at 10", func(t *testing.T) { + input := "v1.0.0, v2.0.0, v3.0.0, v4.0.0, v5.0.0, v6.0.0, v7.0.0, v8.0.0, v9.0.0, v10.0.0, v11.0.0, v12.0.0" parsed := parseServerVersionsFromString(input) - const maxVersions = 5 + const maxVersions = 10 if len(parsed) > maxVersions { parsed = parsed[:maxVersions] } - assert.Len(t, parsed, 5, "CMT versions must be capped at 5") + assert.Len(t, parsed, 10, "CMT versions must be capped at 10") + }) +} + +// ------------------------------------------------------------ +// 15. resolveCMTServerVersions() — auto-derived CMT version set +// ------------------------------------------------------------ + +func TestDryRun_ResolveCMTServerVersions(t *testing.T) { + // A realistic releases payload (newest first): an upcoming RC, recent stable minors, + // and ESR lines flagged in the body. Includes multiple patches per line and a draft. + releasesBody := `[ + {"tag_name":"v11.8.0-rc3","draft":false,"prerelease":true,"body":"Mattermost Platform Release 11.8.0-rc3"}, + {"tag_name":"v11.8.0-rc2","draft":false,"prerelease":true,"body":"rc"}, + {"tag_name":"v11.7.2","draft":false,"prerelease":false,"body":"Mattermost Platform Extended Support Release 11.7.2 contains fixes."}, + {"tag_name":"v11.7.1","draft":false,"prerelease":false,"body":"Mattermost Platform Extended Support Release 11.7.1"}, + {"tag_name":"v11.6.4","draft":false,"prerelease":false,"body":"Mattermost Platform Release 11.6.4"}, + {"tag_name":"v11.6.3","draft":false,"prerelease":false,"body":"Mattermost Platform Release 11.6.3"}, + {"tag_name":"v11.5.7","draft":false,"prerelease":false,"body":"Mattermost Platform Release 11.5.7"}, + {"tag_name":"v11.99.0","draft":true,"prerelease":false,"body":"draft should be ignored"}, + {"tag_name":"v10.11.19","draft":false,"prerelease":false,"body":"Mattermost Platform Extended Support Release 10.11.19 contains security fixes."}, + {"tag_name":"v10.11.18","draft":false,"prerelease":false,"body":"Mattermost Platform Extended Support Release 10.11.18"} + ]` + + t.Run("auto-derives ESR + latest 3 minors + current RC, latest patch each", func(t *testing.T) { + srv := mockReleasesServer(t, releasesBody, http.StatusOK) + s := newDryRunServer(t, "", "mattermost") + s.githubAPIBase = srv.URL + "/" + + got := s.resolveCMTServerVersions() + // 10.11.19 (ESR) + 11.5.7/11.6.4/11.7.2 (latest 3 minors; 11.7 also ESR) + 11.8.0-rc3 (RC), + // latest patch per line, v-stripped, ascending. + assert.Equal(t, []string{"10.11.19", "11.5.7", "11.6.4", "11.7.2", "11.8.0-rc3"}, got) + }) + + t.Run("explicit config override is returned verbatim, no API call", func(t *testing.T) { + called := false + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + called = true + w.WriteHeader(http.StatusInternalServerError) + })) + t.Cleanup(srv.Close) + s := newDryRunServer(t, "", "mattermost") + s.githubAPIBase = srv.URL + "/" + s.Config.CMTServerVersions = []string{"9.11.0", "10.5.0"} + + assert.Equal(t, []string{"9.11.0", "10.5.0"}, s.cmtServerVersions()) + assert.False(t, called, "manual override must not hit the GitHub API") + }) + + t.Run("API error falls back to defaultCMTServerVersions", func(t *testing.T) { + srv := mockReleasesServer(t, "boom", http.StatusInternalServerError) + s := newDryRunServer(t, "", "mattermost") + s.githubAPIBase = srv.URL + "/" + + assert.Equal(t, defaultCMTServerVersions, s.resolveCMTServerVersions()) + }) + + t.Run("RC omitted when not newer than latest stable", func(t *testing.T) { + // Only stable releases here; an old RC for an already-released line must not appear. + body := `[ + {"tag_name":"v11.7.2","draft":false,"prerelease":false,"body":"Mattermost Platform Extended Support Release 11.7.2"}, + {"tag_name":"v11.7.0-rc1","draft":false,"prerelease":true,"body":"rc"}, + {"tag_name":"v11.6.4","draft":false,"prerelease":false,"body":"Mattermost Platform Release 11.6.4"}, + {"tag_name":"v11.5.7","draft":false,"prerelease":false,"body":"Mattermost Platform Release 11.5.7"} + ]` + srv := mockReleasesServer(t, body, http.StatusOK) + s := newDryRunServer(t, "", "mattermost") + s.githubAPIBase = srv.URL + "/" + + got := s.resolveCMTServerVersions() + assert.Equal(t, []string{"11.5.7", "11.6.4", "11.7.2"}, got, "stale RC must be excluded") + }) + + t.Run("parseCMTVersion handles stable, rc, and v-prefix; rejects junk", func(t *testing.T) { + v, ok := parseCMTVersion("v11.8.0-rc3") + assert.True(t, ok) + assert.Equal(t, "11.8.0-rc3", v.raw) + assert.Equal(t, 3, v.rc) + v2, ok2 := parseCMTVersion("10.11.19") + assert.True(t, ok2) + assert.Equal(t, 0, v2.rc) + _, ok3 := parseCMTVersion("v11.7.0-beta.1") + assert.False(t, ok3, "non-rc prerelease suffixes are not CMT versions") + _, ok4 := parseCMTVersion("not-a-version") + assert.False(t, ok4) + // stable sorts above its rc for the same X.Y.Z + assert.True(t, v.less(v2) == false) + }) +} + +// TestResolveBranchHeadSHA verifies the dispatch-time HEAD resolution used to key non-PR +// cleanup. Non-PR flows dispatch the test workflow with ref=branch, so the run's head_sha is +// the branch HEAD at dispatch time. We key cleanup on that resolved SHA (not the trigger SHA) +// so findAndDestroyInstancesBySHA matches when the run completes. +func TestResolveBranchHeadSHA(t *testing.T) { + t.Run("returns the branch HEAD sha from the commits API", func(t *testing.T) { + var gotPath string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + gotPath = r.URL.Path + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte(`{"sha":"abc123def456"}`)) + })) + t.Cleanup(srv.Close) + + s := newDryRunServer(t, "", "mattermost") + s.githubAPIBase = srv.URL + "/" + + sha, err := s.resolveBranchHeadSHA("mattermost", "desktop", "release-12.0") + assert.NoError(t, err) + assert.Equal(t, "abc123def456", sha) + assert.Equal(t, "/repos/mattermost/desktop/commits/release-12.0", gotPath) + }) + + t.Run("errors on non-2xx so caller can fall back to trigger sha", func(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusNotFound) + })) + t.Cleanup(srv.Close) + + s := newDryRunServer(t, "", "mattermost") + s.githubAPIBase = srv.URL + "/" + + _, err := s.resolveBranchHeadSHA("mattermost", "desktop", "no-such-branch") + assert.Error(t, err) + }) + + t.Run("errors on empty sha so caller can fall back to trigger sha", func(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte(`{"sha":""}`)) + })) + t.Cleanup(srv.Close) + + s := newDryRunServer(t, "", "mattermost") + s.githubAPIBase = srv.URL + "/" + + _, err := s.resolveBranchHeadSHA("mattermost", "desktop", "main") + assert.Error(t, err) + }) +} + +// TestE2EPRInstanceMaxAge verifies the PR max-age knob: configured value wins, else 24h default. +func TestE2EPRInstanceMaxAge(t *testing.T) { + s := newDryRunServer(t, "", "mattermost") + + s.Config.E2EPRInstanceMaxAge = 0 + assert.Equal(t, 24*time.Hour, s.e2ePRInstanceMaxAge(), "0 should fall back to 24h default") + + s.Config.E2EPRInstanceMaxAge = 48 + assert.Equal(t, 48*time.Hour, s.e2ePRInstanceMaxAge(), "configured value should win") +} + +// TestEvictReapedPRInstances verifies that when the periodic scan reaps a PR's servers, the +// in-memory tracking entry is removed so the next E2E/Run provisions a fresh set rather than +// reusing now-deleted servers. Non-PR (SHA-keyed) entries must be left untouched. +func TestEvictReapedPRInstances(t *testing.T) { + t.Run("evicts the PR key when any of its instances was reaped", func(t *testing.T) { + s := newDryRunServer(t, "", "mattermost") + s.e2eInstances["desktop-pr-42"] = []*E2EInstance{ + {InstallationID: "inst-a", Platform: "linux"}, + {InstallationID: "inst-b", Platform: "macos"}, + {InstallationID: "inst-c", Platform: "windows"}, + } + + // Only one member reaped, but the whole set ages out together, so the key goes. + s.evictReapedPRInstances([]string{"inst-b"}, s.Logger) + + _, ok := s.e2eInstances["desktop-pr-42"] + assert.False(t, ok, "PR key must be evicted so re-applying E2E/Run creates a fresh set") + }) + + t.Run("leaves unrelated PR keys and non-PR (SHA-keyed) entries intact", func(t *testing.T) { + s := newDryRunServer(t, "", "mattermost") + s.e2eInstances["desktop-pr-42"] = []*E2EInstance{{InstallationID: "inst-a"}} + s.e2eInstances["mattermost-mobile-pr-9"] = []*E2EInstance{{InstallationID: "inst-x"}} + s.e2eInstances["desktop-cmt-555-deadbeef"] = []*E2EInstance{{InstallationID: "inst-cmt"}} + + s.evictReapedPRInstances([]string{"inst-a"}, s.Logger) + + _, gone := s.e2eInstances["desktop-pr-42"] + assert.False(t, gone, "the matching PR key is evicted") + _, otherPR := s.e2eInstances["mattermost-mobile-pr-9"] + assert.True(t, otherPR, "an unrelated PR key must remain") + _, cmt := s.e2eInstances["desktop-cmt-555-deadbeef"] + assert.True(t, cmt, "a non-PR (SHA-keyed) entry must remain") + }) + + t.Run("no reaped IDs is a no-op", func(t *testing.T) { + s := newDryRunServer(t, "", "mattermost") + s.e2eInstances["desktop-pr-42"] = []*E2EInstance{{InstallationID: "inst-a"}} + + s.evictReapedPRInstances(nil, s.Logger) + + _, ok := s.e2eInstances["desktop-pr-42"] + assert.True(t, ok, "nothing reaped means nothing evicted") }) } + +// TestShouldTriggerCMT verifies CMT gating across all sources: manual dispatch (any ref), RC +// tag cut (the new primary trigger), and release branch (defense-in-depth). Anything else — +// feature branches, GA tags, nightly tags, beta tags, default branch — must be rejected so +// that mobile's `on: push tags` glob slips and stray runs don't burn the multi-version matrix. +func TestShouldTriggerCMT(t *testing.T) { + s := newDryRunServer(t, "", "mattermost") + + // Manual dispatch always runs, regardless of ref. + assert.True(t, s.shouldTriggerCMT("workflow_dispatch", "main")) + assert.True(t, s.shouldTriggerCMT("workflow_dispatch", "v6.2.0-rc.1")) + assert.True(t, s.shouldTriggerCMT("workflow_dispatch", "release-6.2")) + + // RC tag cut (primary trigger). For tag pushes head_branch is the tag name. + assert.True(t, s.shouldTriggerCMT("push", "v6.2.0-rc.1")) // desktop convention + assert.True(t, s.shouldTriggerCMT("push", "v2.41.0-rc.1")) // future mobile + assert.True(t, s.shouldTriggerCMT("push", "v6.2.0-rc.10")) // multi-digit rc + assert.True(t, s.shouldTriggerCMT("push", "6.2.0-rc.1")) // missing 'v' prefix is permitted + assert.True(t, s.shouldTriggerCMT("push", "v6.2.0-rc1")) // no separator before number + + // Release branch (defense-in-depth — kept for backwards compat / manual triggers). + assert.True(t, s.shouldTriggerCMT("push", "release-6.2")) + + // Must NOT trigger: GA tags, betas, nightly tags, feature branches, default branch. + assert.False(t, s.shouldTriggerCMT("push", "v6.2.0")) // GA tag — no -rc + assert.False(t, s.shouldTriggerCMT("push", "v1.0.22-beta")) // pre-release but not RC + assert.False(t, s.shouldTriggerCMT("push", "6.3.0-nightly.20260601")) // nightly tag + assert.False(t, s.shouldTriggerCMT("push", "v6.2.0-rcabc")) // -rc but no number + assert.False(t, s.shouldTriggerCMT("create", "feature/cool-thing")) + assert.False(t, s.shouldTriggerCMT("push", "main")) + assert.False(t, s.shouldTriggerCMT("schedule", "main")) + + // Mobile build-release-NNN branch push (mobile's RC-cut equivalent). + assert.True(t, s.shouldTriggerCMT("push", "build-release-786")) // 3-digit + assert.True(t, s.shouldTriggerCMT("push", "build-release-1100")) // 4-digit + assert.True(t, s.shouldTriggerCMT("push", "build-release-12345")) // future 5-digit + assert.False(t, s.shouldTriggerCMT("push", "build-release-1")) // < 3 digits, likely a test + assert.False(t, s.shouldTriggerCMT("push", "build-release-ios-707")) // platform variant +} + +// TestIsRCTag covers the RC-tag regex in isolation so the boundary cases stay locked in. +func TestIsRCTag(t *testing.T) { + for _, ref := range []string{"v6.2.0-rc.1", "v6.2.0-rc.10", "v2.41.0-rc.2", "6.2.0-rc.1", "v6.2.0-rc1", "v6.2.0-rc-1"} { + assert.True(t, isRCTag(ref), "expected RC tag: %q", ref) + } + for _, ref := range []string{ + "v6.2.0", // GA + "v6.2.0-rc", // missing number + "v6.2.0-rcabc", // letters after -rc + "v1.0.22-beta", // not RC + "6.3.0-nightly.20260601", // nightly + "release-6.2", // branch + "main", + "", + } { + assert.False(t, isRCTag(ref), "must not match: %q", ref) + } +} + +// TestIsBuildReleaseBranch locks in the boundaries for mobile's build-release-NNN gate: +// 3+ digits required (rejects test artifacts like build-release-1); platform-specific +// variants (build-release-ios-NNN etc.) are rejected; no overlap with the RC-tag or +// release-* gates. +func TestIsBuildReleaseBranch(t *testing.T) { + for _, ref := range []string{ + "build-release-786", // real 3-digit + "build-release-1100", // real 4-digit + "build-release-12345", // future 5-digit + } { + assert.True(t, isBuildReleaseBranch(ref), "expected build-release branch: %q", ref) + } + for _, ref := range []string{ + "build-release-1", // 1 digit — test artifact / typo + "build-release-99", // 2 digit — below convention + "build-release-ios-707", // platform-specific + "build-release-sim-707", // simulator-only + "build-release-android-1100", // android-specific + "build-release-786-rc1", // stray suffix + "build-release-", // no number + "build-release-abc", // non-numeric + "release-2.41", // handled by isReleaseBranch + "v2.41.0-rc.1", // handled by isRCTag + "", + } { + assert.False(t, isBuildReleaseBranch(ref), "must not match: %q", ref) + } +} diff --git a/server/e2e_tests.go b/server/e2e_tests.go index f0efae3..aadf90b 100644 --- a/server/e2e_tests.go +++ b/server/e2e_tests.go @@ -9,6 +9,8 @@ import ( "fmt" "net/url" "os" + "sort" + "strconv" "strings" "sync" "time" @@ -701,14 +703,38 @@ func (s *Server) e2eInstanceMaxAge() time.Duration { return 3 * time.Hour } -// PR instances (identified by "-pr-" in their OwnerID) are always skipped — handleE2ECleanup -// on PR close manages their lifecycle via cloud-API orphan scan. -func (s *Server) cleanupStaleNonPRE2EInstances() { - maxAge := s.e2eInstanceMaxAge() +// e2ePRInstanceMaxAge returns the maximum age a PR E2E instance may reach before the periodic +// scan deletes it. PR instances are reused across label toggles and commits, so this is much +// longer than e2eInstanceMaxAge. Falls back to 24 hours when the config value is 0 (unset). +func (s *Server) e2ePRInstanceMaxAge() time.Duration { + if s.Config.E2EPRInstanceMaxAge > 0 { + return time.Duration(s.Config.E2EPRInstanceMaxAge) * time.Hour + } + return 24 * time.Hour +} + +// cleanupStaleE2EInstances destroys aged-out E2E instances of both kinds: +// - non-PR instances (CMT/nightly/push) past e2eInstanceMaxAge — backstop for the primary +// SHA-matched completion cleanup. +// - PR instances past e2ePRInstanceMaxAge — PR instances have no completion-based teardown +// (they are deliberately kept alive for reuse), so this age cap stops long-open PRs from +// accumulating servers indefinitely. When a PR instance is reaped, its in-memory tracking +// entry is also evicted so re-applying E2E/Run provisions a fresh set instead of dispatching +// against a now-deleted server. +func (s *Server) cleanupStaleE2EInstances() { + nonPRMaxAge := s.e2eInstanceMaxAge() + prMaxAge := s.e2ePRInstanceMaxAge() logger := s.Logger.WithField("type", "periodic_e2e_cleanup") - logger.WithField("max_age_hours", maxAge.Hours()).Info("Scanning for stale non-PR E2E instances") + logger.WithFields(logrus.Fields{ + "non_pr_max_age_hours": nonPRMaxAge.Hours(), + "pr_max_age_hours": prMaxAge.Hours(), + }).Info("Scanning for stale E2E instances") + + now := time.Now() + nonPRCutoffMs := now.Add(-nonPRMaxAge).UnixMilli() + prCutoffMs := now.Add(-prMaxAge).UnixMilli() - cutoffMs := time.Now().Add(-maxAge).UnixMilli() + var reapedPRInstallationIDs []string for _, instanceType := range []string{"desktop", "mobile"} { pattern := instanceType + "-%" @@ -729,18 +755,22 @@ func (s *Server) cleanupStaleNonPRE2EInstances() { continue } - // PR instances have "-pr-" in their OwnerID (e.g. "mobile-pr-123-site-1-..."). - // Skip them — handleE2ECleanup on PR close manages their lifecycle. - if strings.Contains(inst.OwnerID, "-pr-") { - continue + // PR instances have "-pr-" in their OwnerID (e.g. "mobile-pr-123-site-1-...") + // and use the longer PR max-age; everything else uses the non-PR max-age. + isPR := strings.Contains(inst.OwnerID, "-pr-") + cutoffMs := nonPRCutoffMs + if isPR { + cutoffMs = prCutoffMs } - // Skip instances younger than maxAge — a test may still be using them. + // Skip instances younger than their max age — a test may still be using them, + // or (for PRs) the servers are being kept alive for reuse. if inst.CreateAt > cutoffMs { logger.WithFields(logrus.Fields{ "installation_id": inst.ID, "owner_id": inst.OwnerID, - }).Debug("Skipping non-PR instance younger than max age (may still be in use)") + "is_pr": isPR, + }).Debug("Skipping instance younger than its max age") continue } @@ -758,15 +788,54 @@ func (s *Server) cleanupStaleNonPRE2EInstances() { "installation_id": inst.ID, "owner_id": inst.OwnerID, "state": inst.State, + "is_pr": isPR, }) - instLogger.Warn("Destroying stale non-PR E2E instance") + instLogger.Warn("Destroying stale E2E instance") if err := s.CloudClient.DeleteInstallation(inst.ID); err != nil { - instLogger.WithError(err).Error("Failed to destroy stale non-PR E2E instance") + instLogger.WithError(err).Error("Failed to destroy stale E2E instance") + continue + } + if isPR { + reapedPRInstallationIDs = append(reapedPRInstallationIDs, inst.ID) } } } - logger.Info("Non-PR E2E instance cleanup scan complete") + // Evict in-memory PR tracking entries whose servers were just reaped so the reuse path + // in handleE2ETestRequest sees no live instances and creates a fresh set on the next + // E2E/Run, instead of dispatching a workflow against deleted servers. + if len(reapedPRInstallationIDs) > 0 { + s.evictReapedPRInstances(reapedPRInstallationIDs, logger) + } + + logger.Info("E2E instance cleanup scan complete") +} + +// evictReapedPRInstances removes PR tracking entries from the in-memory map when any of their +// servers were deleted by the periodic age scan. A PR's instances share a creation time, so the +// whole set ages out together; removing the entry when any member is reaped ensures the reuse +// path never hands back a partially-deleted set. +func (s *Server) evictReapedPRInstances(reapedInstallationIDs []string, logger logrus.FieldLogger) { + reaped := make(map[string]bool, len(reapedInstallationIDs)) + for _, id := range reapedInstallationIDs { + reaped[id] = true + } + + s.e2eInstancesLock.Lock() + defer s.e2eInstancesLock.Unlock() + for key, instances := range s.e2eInstances { + // PR tracking keys are "{repo}-pr-{number}"; non-PR keys are keyed by SHA elsewhere. + if !strings.Contains(key, "-pr-") { + continue + } + for _, inst := range instances { + if inst != nil && reaped[inst.InstallationID] { + delete(s.e2eInstances, key) + logger.WithField("key", key).Info("Evicted expired PR E2E instances from tracking map") + break + } + } + } } // resolveE2EServerVersion returns the server version for E2E provisioning. @@ -847,6 +916,221 @@ func (s *Server) resolveE2EServerVersion() string { return "master" } +// resolveBranchHeadSHA returns the current HEAD commit SHA of branch via the GitHub API. +// +// Non-PR E2E flows (push, nightly, CMT) provision instances and then dispatch the test +// workflow with `ref: ` (workflow_dispatch cannot take a commit SHA). GitHub records +// the dispatched run's head_sha as the branch HEAD at dispatch time, which can differ from the +// SHA that triggered provisioning if the branch advanced during the (often long) provisioning +// window. Cleanup keys instances by SHA and matches on the completed run's head_sha, so we key +// on the branch HEAD resolved at dispatch time to keep them aligned. Callers fall back to the +// trigger SHA on error (the periodic age-based scan remains the backstop either way). +func (s *Server) resolveBranchHeadSHA(owner, repoName, branch string) (string, error) { + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + client := newGithubClient(s.Config.GithubAccessToken) + if s.githubAPIBase != "" { + if baseURL, parseErr := url.Parse(s.githubAPIBase); parseErr == nil { + client.BaseURL = baseURL + } + } + + var commit struct { + SHA string `json:"sha"` + } + req, err := client.NewRequest("GET", fmt.Sprintf("/repos/%s/%s/commits/%s", owner, repoName, branch), nil) + if err != nil { + return "", err + } + if _, err := client.Do(ctx, req, &commit); err != nil { + return "", err + } + if commit.SHA == "" { + return "", fmt.Errorf("empty SHA for %s/%s@%s", owner, repoName, branch) + } + return commit.SHA, nil +} + +// cmtVersion is a parsed Mattermost release version: major.minor.patch with an optional +// release-candidate number. raw is the bare-semver string passed to the cloud provisioner +// (e.g. "11.7.1" or "11.8.0-rc3"). +type cmtVersion struct { + major, minor, patch int + rc int // 0 = stable, >0 = -rcN + raw string +} + +// parseCMTVersion parses "vX.Y.Z" or "vX.Y.Z-rcN" (the leading "v" is optional). It returns +// ok=false for anything else (other prerelease suffixes like -beta/-alpha are ignored for CMT). +func parseCMTVersion(tag string) (cmtVersion, bool) { + raw := strings.TrimPrefix(strings.TrimSpace(tag), "v") + base := raw + rc := 0 + if i := strings.Index(base, "-rc"); i != -1 { + n, err := strconv.Atoi(base[i+len("-rc"):]) + if err != nil { + return cmtVersion{}, false + } + rc = n + base = base[:i] + } else if strings.Contains(base, "-") { + return cmtVersion{}, false + } + parts := strings.Split(base, ".") + if len(parts) != 3 { + return cmtVersion{}, false + } + maj, err1 := strconv.Atoi(parts[0]) + min, err2 := strconv.Atoi(parts[1]) + pat, err3 := strconv.Atoi(parts[2]) + if err1 != nil || err2 != nil || err3 != nil { + return cmtVersion{}, false + } + return cmtVersion{major: maj, minor: min, patch: pat, rc: rc, raw: raw}, true +} + +// less reports whether a sorts before b by (major, minor, patch, rc). For the same X.Y.Z, a +// stable release (rc==0) sorts above its release candidates (e.g. 11.8.0-rc3 < 11.8.0). +func (a cmtVersion) less(b cmtVersion) bool { + if a.major != b.major { + return a.major < b.major + } + if a.minor != b.minor { + return a.minor < b.minor + } + if a.patch != b.patch { + return a.patch < b.patch + } + ar, br := a.rc, b.rc + if ar == 0 { + ar = int(^uint(0) >> 1) // treat stable as the highest "rc" for the same patch + } + if br == 0 { + br = int(^uint(0) >> 1) + } + return ar < br +} + +// cmtServerVersions returns the version set CMT runs against. An explicit, non-empty +// Config.CMTServerVersions is used verbatim (manual override / pin); otherwise the set is +// auto-derived from the Mattermost GitHub releases. +func (s *Server) cmtServerVersions() []string { + if len(s.Config.CMTServerVersions) > 0 { + return s.Config.CMTServerVersions + } + return s.resolveCMTServerVersions() +} + +// resolveCMTServerVersions auto-derives the CMT version set from the mattermost/mattermost +// GitHub releases: the active ESR line(s) + the latest 3 stable minor lines + the current +// release candidate, each as the latest patch of its line. ESR lines are detected from the +// release notes ("Extended Support Release"). It is called once per CMT trigger (rare: +// monthly schedule + occasional manual dispatch), so it fetches fresh with no caching. +// Falls back to defaultCMTServerVersions on error or if nothing parses. +func (s *Server) resolveCMTServerVersions() []string { + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + client := newGithubClient(s.Config.GithubAccessToken) + // githubAPIBase is only set in tests to redirect to a mock server. + if s.githubAPIBase != "" { + if baseURL, parseErr := url.Parse(s.githubAPIBase); parseErr == nil { + client.BaseURL = baseURL + } + } + + var releases []struct { + TagName string `json:"tag_name"` + Draft bool `json:"draft"` + Prerelease bool `json:"prerelease"` + Body string `json:"body"` + } + req, err := client.NewRequest("GET", "/repos/mattermost/mattermost/releases?per_page=100", nil) + if err != nil { + s.Logger.WithError(err).Warn("[resolveCMTServerVersions] Failed to build request; using default CMT versions") + return defaultCMTServerVersions + } + if _, err = client.Do(ctx, req, &releases); err != nil { + s.Logger.WithError(err).Warn("[resolveCMTServerVersions] Failed to fetch releases; using default CMT versions") + return defaultCMTServerVersions + } + + type minorKey struct{ major, minor int } + latestStable := map[minorKey]cmtVersion{} + esrMinors := map[minorKey]bool{} + var bestRC cmtVersion + haveRC := false + + for _, r := range releases { + if r.Draft { + continue + } + v, ok := parseCMTVersion(r.TagName) + if !ok { + continue + } + key := minorKey{v.major, v.minor} + if v.rc > 0 { + if !haveRC || bestRC.less(v) { + bestRC = v + haveRC = true + } + continue + } + if cur, exists := latestStable[key]; !exists || cur.less(v) { + latestStable[key] = v + } + if strings.Contains(strings.ToLower(r.Body), "extended support release") { + esrMinors[key] = true + } + } + + if len(latestStable) == 0 { + s.Logger.Warn("[resolveCMTServerVersions] No stable releases parsed; using default CMT versions") + return defaultCMTServerVersions + } + + // All stable minor lines, sorted descending (newest first). + minors := make([]cmtVersion, 0, len(latestStable)) + for _, v := range latestStable { + minors = append(minors, v) + } + sort.Slice(minors, func(i, j int) bool { return minors[j].less(minors[i]) }) + + selected := map[minorKey]cmtVersion{} + for i := 0; i < len(minors) && i < 3; i++ { // latest 3 stable minor lines + selected[minorKey{minors[i].major, minors[i].minor}] = minors[i] + } + for k := range esrMinors { // active ESR line(s) + if v, ok := latestStable[k]; ok { + selected[k] = v + } + } + + chosen := make([]cmtVersion, 0, len(selected)+1) + for _, v := range selected { + chosen = append(chosen, v) + } + // Include the current RC only when it's newer than the newest stable (an upcoming release). + if haveRC && minors[0].less(bestRC) { + chosen = append(chosen, bestRC) + } + sort.Slice(chosen, func(i, j int) bool { return chosen[i].less(chosen[j]) }) // ascending + + const maxVersions = 10 + if len(chosen) > maxVersions { + chosen = chosen[len(chosen)-maxVersions:] // keep the newest if somehow over the cap + } + + versions := make([]string, 0, len(chosen)) + for _, v := range chosen { + versions = append(versions, v.raw) + } + s.Logger.WithField("versions", versions).Info("[resolveCMTServerVersions] Auto-derived CMT server version set") + return versions +} + // destroyE2EInstances destroys all given E2E instances func (s *Server) destroyE2EInstances(instances []*E2EInstance, logger logrus.FieldLogger) { for _, instance := range instances { @@ -1048,11 +1332,15 @@ func (s *Server) buildInstanceDetailsJSON(instances []*E2EInstance) (string, err return string(jsonBytes), nil } -// dispatchDesktopE2EWorkflow triggers the desktop E2E workflow via GitHub Actions API. -// trackingKey is the s.e2eInstances map key for this run; when non-empty it is passed -// as the "mw_tracking_key" workflow input so the workflow_run completed handler can do -// a direct key lookup instead of fragile SHA suffix matching. -func (s *Server) dispatchDesktopE2EWorkflow(repoOwner, repoName, ref, sha, instanceDetailsJSON, runType, trackingKey string, nightly bool) error { +// dispatchDesktopE2EWorkflow triggers the desktop E2E workflow (e2e-functional.yml) via the +// GitHub Actions API. Cleanup is driven by the workflow_run completed event matched on the +// commit SHA, so no tracking key is passed as an input (e2e-functional.yml does not declare +// one, and GitHub rejects a workflow_dispatch carrying an undeclared input with a 422). +// +// `nightly` was previously sent on this dispatch when matterwick's nightly handler fired the +// desktop workflow. That handler is gone; matterwick never has reason to set nightly=true any +// more, and the downstream `nightly` input was removed from e2e-functional.yml. Don't send it. +func (s *Server) dispatchDesktopE2EWorkflow(repoOwner, repoName, ref, sha, instanceDetailsJSON, runType string) error { ctx := context.Background() client := newGithubClient(s.Config.GithubAccessToken) @@ -1083,10 +1371,6 @@ func (s *Server) dispatchDesktopE2EWorkflow(repoOwner, repoName, ref, sha, insta "MM_TEST_USER_NAME": s.Config.E2EUsername, "MM_SERVER_VERSION": serverVersion, "run_type": runType, - "nightly": fmt.Sprintf("%t", nightly), - } - if trackingKey != "" { - workflowInputs["mw_tracking_key"] = trackingKey } // Use REST API to trigger workflow dispatch (v32 go-github compatibility) @@ -1116,11 +1400,11 @@ func (s *Server) dispatchDesktopE2EWorkflow(repoOwner, repoName, ref, sha, insta return nil } -// dispatchMobileE2EWorkflow triggers the mobile E2E workflow via GitHub Actions API. -// trackingKey is the s.e2eInstances map key for this run; when non-empty it is passed -// as the "mw_tracking_key" workflow input so the workflow_run completed handler can do -// a direct key lookup instead of fragile SHA suffix matching. -func (s *Server) dispatchMobileE2EWorkflow(repoOwner, repoName, ref, sha, site1URL, site2URL, site3URL, platform, runType, trackingKey string) error { +// dispatchMobileE2EWorkflow triggers the mobile E2E workflow (e2e-detox-pr.yml) via the +// GitHub Actions API. Cleanup is driven by the workflow_run completed event matched on the +// commit SHA, so no tracking key is passed as an input (e2e-detox-pr.yml does not declare +// one, and GitHub rejects a workflow_dispatch carrying an undeclared input with a 422). +func (s *Server) dispatchMobileE2EWorkflow(repoOwner, repoName, ref, sha, site1URL, site2URL, site3URL, platform, runType string) error { ctx := context.Background() client := newGithubClient(s.Config.GithubAccessToken) @@ -1138,9 +1422,6 @@ func (s *Server) dispatchMobileE2EWorkflow(repoOwner, repoName, ref, sha, site1U "PLATFORM": platform, "run_type": runType, } - if trackingKey != "" { - workflowInputs["mw_tracking_key"] = trackingKey - } // Use REST API to trigger workflow dispatch (v32 go-github compatibility) req, err := client.NewRequest("POST", diff --git a/server/push_events.go b/server/push_events.go index 685a269..67c192d 100644 --- a/server/push_events.go +++ b/server/push_events.go @@ -39,24 +39,29 @@ func (s *Server) handlePushEvent(event *github.PushEvent) { }) logger.Info("Push event received") - if s.Config.E2EAutoTriggerOnRelease && s.isReleaseBranch(branch) { - logger.WithField("type", "release_branch").Info("Release branch detected, triggering E2E tests") - go s.handlePushEventE2E(event, branch) - return - } + // Push-event auto-trigger reduced to desktop `master` only. + // - release-X.Y push: previously fired full PR-style E2E for both repos. Removed — + // desktop is gated by CMT-on-RC-tag (release team's go/no-go); mobile's per-cherry- + // pick E2E during stabilization ran into tens of thousands of $ per release cycle + // for a signal already covered by PR-label E2E pre-merge and CMT smoke at the RC + // build moment. + // - mobile `main` push: same cost calculus, every PR merge to main was one ~$27 run. + // Skipped here; PR-label E2E covers pre-merge. + // - desktop `master` push: still fires (lower merge cadence than mobile main; the + // per-commit regression signal earns its keep here). if s.Config.E2EAutoTriggerOnMaster && (branch == "master" || branch == "main") { + if strings.Contains(repoName, "mobile") { + logger.Info("Mobile main push: skipping E2E (per-commit cost too high; PR-label covers pre-merge, CMT covers RC builds)") + return + } logger.WithField("type", "master_main").Info("Master/main branch detected, triggering E2E tests") go s.handlePushEventE2E(event, branch) return } - logger.WithFields(logrus.Fields{ - "auto_release": s.Config.E2EAutoTriggerOnRelease, - "auto_master": s.Config.E2EAutoTriggerOnMaster, - "release_pattern_prefix": s.Config.E2EReleasePatternPrefix, - "is_release_branch": s.isReleaseBranch(branch), - }).Info("Push event does not match E2E trigger conditions") + logger.WithField("auto_master", s.Config.E2EAutoTriggerOnMaster). + Info("Push event does not match E2E trigger conditions") } // isReleaseBranch returns true if branch matches E2EReleasePatternPrefix. @@ -133,14 +138,26 @@ func (s *Server) handlePushEventE2E(event *github.PushEvent, branch string) { logger.WithField("instanceCount", len(instances)).Info("E2E instances created successfully") + // Key on the branch HEAD resolved now (just before dispatch), not the push SHA: the + // dispatched (ref=branch) run reports its head_sha as the branch HEAD at dispatch time, + // which can differ from the push SHA if the branch advanced during provisioning. The key + // still ends with "-{sha}" so findAndDestroyInstancesBySHA matches it by suffix on + // completion. Fall back to the push SHA on error (the periodic scan remains the backstop). + cleanupSHA := sha + if resolved, resErr := s.resolveBranchHeadSHA(s.Config.Org, repoName, branch); resErr == nil && resolved != "" { + cleanupSHA = resolved + } else if resErr != nil { + logger.WithError(resErr).Warn("Failed to resolve branch HEAD SHA; keying cleanup on push SHA (periodic scan remains the backstop)") + } + // Store instances before dispatching so a fast-completing workflow_run event // doesn't race ahead and find nothing to clean up. - key := fmt.Sprintf("%s-push-%s-%s", repoName, branch, sha) + key := fmt.Sprintf("%s-push-%s-%s", repoName, branch, cleanupSHA) s.e2eInstancesLock.Lock() s.e2eInstances[key] = instances s.e2eInstancesLock.Unlock() - err = s.triggerE2EWorkflowForPushEvent(repoName, instanceType, branch, sha, key, instances) + err = s.triggerE2EWorkflowForPushEvent(repoName, instanceType, branch, sha, instances) if err != nil { logger.WithError(err).Error("Failed to trigger E2E workflow") s.e2eInstancesLock.Lock() @@ -249,8 +266,8 @@ func getRunnerForPlatform(platform string) string { } // triggerE2EWorkflowForPushEvent routes to the desktop or mobile dispatch function. -// trackingKey is embedded in workflow inputs as mw_tracking_key for cleanup on completion. -func (s *Server) triggerE2EWorkflowForPushEvent(repoName, instanceType, branch, sha, trackingKey string, instances []*E2EInstance) error { +// Cleanup is driven by the workflow_run completed event matched on the commit SHA. +func (s *Server) triggerE2EWorkflowForPushEvent(repoName, instanceType, branch, sha string, instances []*E2EInstance) error { logger := s.Logger.WithFields(logrus.Fields{ "repo": repoName, "instanceType": instanceType, @@ -265,14 +282,14 @@ func (s *Server) triggerE2EWorkflowForPushEvent(repoName, instanceType, branch, } if instanceType == "desktop" { - return s.triggerDesktopE2EWorkflowForPushEvent(repoOwner, repoName, branch, sha, trackingKey, instances) + return s.triggerDesktopE2EWorkflowForPushEvent(repoOwner, repoName, branch, sha, instances) } - return s.triggerMobileE2EWorkflowForPushEvent(repoOwner, repoName, branch, sha, trackingKey, instances) + return s.triggerMobileE2EWorkflowForPushEvent(repoOwner, repoName, branch, sha, instances) } // triggerDesktopE2EWorkflowForPushEvent dispatches the desktop E2E workflow. -func (s *Server) triggerDesktopE2EWorkflowForPushEvent(repoOwner, repoName, branch, sha, trackingKey string, instances []*E2EInstance) error { +func (s *Server) triggerDesktopE2EWorkflowForPushEvent(repoOwner, repoName, branch, sha string, instances []*E2EInstance) error { logger := s.Logger.WithFields(logrus.Fields{ "repo": repoName, "branch": branch, @@ -286,16 +303,13 @@ func (s *Server) triggerDesktopE2EWorkflowForPushEvent(repoOwner, repoName, bran logger.WithField("instanceDetails", instanceDetailsJSON).Debug("Triggering desktop E2E workflow") - runType := "MASTER" - if s.isReleaseBranch(branch) { - runType = "RELEASE" - } - - return s.dispatchDesktopE2EWorkflow(repoOwner, repoName, branch, sha, instanceDetailsJSON, runType, trackingKey, false) + // handlePushEvent only routes master/main pushes here (release-branch push trigger was + // removed), so runType is always MASTER for desktop push events. + return s.dispatchDesktopE2EWorkflow(repoOwner, repoName, branch, sha, instanceDetailsJSON, "MASTER") } // triggerMobileE2EWorkflowForPushEvent dispatches the mobile E2E workflow (e2e-detox-pr.yml). -func (s *Server) triggerMobileE2EWorkflowForPushEvent(repoOwner, repoName, branch, sha, trackingKey string, instances []*E2EInstance) error { +func (s *Server) triggerMobileE2EWorkflowForPushEvent(repoOwner, repoName, branch, sha string, instances []*E2EInstance) error { logger := s.Logger.WithFields(logrus.Fields{ "repo": repoName, "branch": branch, @@ -312,15 +326,14 @@ func (s *Server) triggerMobileE2EWorkflowForPushEvent(repoOwner, repoName, branc "site_3_url": instances[2].URL, }).Debug("Triggering mobile E2E workflow for push event") - runType := "MASTER" - if s.isReleaseBranch(branch) { - runType = "RELEASE" - } - + // handlePushEvent only routes master/main pushes here (release-branch push trigger was + // removed and mobile main push is explicitly skipped, so in practice mobile never + // reaches this code today; keeping the function intact in case mobile main push is + // re-enabled). runType is always MASTER for the events that would route here. return s.dispatchMobileE2EWorkflow( repoOwner, repoName, branch, sha, instances[0].URL, instances[1].URL, instances[2].URL, "both", // push events always test both iOS and Android - runType, trackingKey, + "MASTER", ) } diff --git a/server/server.go b/server/server.go index f1e5cdf..6312b81 100644 --- a/server/server.go +++ b/server/server.go @@ -134,12 +134,13 @@ func New(config *MatterwickConfig) *Server { func (s *Server) Start() { s.Logger.Info("Starting MatterWick Server") - // Destroy stale non-PR E2E instances left from a previous run immediately on startup, - // then continue scanning periodically so a mid-run restart doesn't leave orphaned - // instances alive until the *next* matterwick restart. - // The scan interval is half the configured max-age so the worst-case orphan lifetime - // is maxAge + interval ≈ 1.5× maxAge. - s.cleanupStaleNonPRE2EInstances() + // Destroy stale E2E instances (non-PR backstop + aged-out PR instances) left from a + // previous run immediately on startup, then continue scanning periodically so a mid-run + // restart doesn't leave orphaned instances alive until the *next* matterwick restart. + // The scan interval is half the (shorter) non-PR max-age so the worst-case non-PR orphan + // lifetime is maxAge + interval ≈ 1.5× maxAge; PR instances use a longer max-age but the + // same frequent scan, which is harmless. + s.cleanupStaleE2EInstances() go func() { interval := s.e2eInstanceMaxAge() / 2 if interval < 30*time.Minute { @@ -150,7 +151,7 @@ func (s *Server) Start() { for { select { case <-ticker.C: - s.cleanupStaleNonPRE2EInstances() + s.cleanupStaleE2EInstances() case <-s.stopCh: return } diff --git a/server/workflow_run.go b/server/workflow_run.go index eba97da..c335bd7 100644 --- a/server/workflow_run.go +++ b/server/workflow_run.go @@ -8,6 +8,7 @@ import ( "encoding/json" "fmt" "io" + "regexp" "strings" "sync" @@ -78,171 +79,62 @@ func (s *Server) handleWorkflowRunEventWithInputs(payload *WorkflowRunWebhookPay "head_sha": headSHA, }) - // CMT: "CMT Provisioner" (user-dispatched) provisions servers; "Compatibility Matrix Testing" runs tests. - if strings.Contains(workflowName, "cmt") || strings.Contains(workflowName, "CMT") { - if payload.Action == "completed" { - logger.Debug("CMT trigger workflow completed; sha-based cleanup is primary") - s.handleCMTRunCleanup(repoName, headSHA, logger) - return - } - if payload.Action != "requested" { - logger.Debug("Ignoring CMT workflow action (not requested or completed)") - return - } - logger.Info("Processing CMT workflow_run event") - serverVersionsStr, ok := payload.WorkflowRun.Inputs["server_versions"] - if !ok || serverVersionsStr == "" { - logger.Error("No server_versions found in workflow inputs") - return - } - serverVersions := parseServerVersionsFromString(serverVersionsStr) - if len(serverVersions) == 0 { - logger.Error("Failed to parse server versions from workflow input") - return - } - logger.WithField("serverVersions", serverVersions).Info("Extracted server versions from workflow inputs") - var instanceType string - if strings.Contains(repoName, "desktop") { - instanceType = "desktop" - } else if strings.Contains(repoName, "mobile") { - instanceType = "mobile" - } else { - logger.Warn("Repository is neither desktop nor mobile, skipping CMT") - return - } - go s.handleCMTWithServerVersions(owner, repoName, instanceType, headBranch, headSHA, serverVersions, runID, logger) - return - } - - // Nightly: lightweight trigger workflow fires first; matterwick provisions instances and dispatches the real test workflow. - if s.Config.E2ENightlyTriggerWorkflowName != "" && workflowName == s.Config.E2ENightlyTriggerWorkflowName { + // CMT: a lightweight, scheduled trigger workflow (Config.CMTTriggerWorkflowName, e.g. + // "CMT Provisioner") fires in the desktop/mobile repo. matterwick provisions one instance + // per version in the resolved CMT set (s.cmtServerVersions) and dispatches + // compatibility-matrix-testing.yml. No inputs are read from the event — the version set is + // resolved by matterwick — so this works despite GitHub's workflow_run payload not carrying + // workflow_dispatch inputs. + // Cleanup happens when the test workflow ("Compatibility Matrix Testing") completes, handled + // by the isE2ETestWorkflow branch below. + if s.Config.CMTTriggerWorkflowName != "" && workflowName == s.Config.CMTTriggerWorkflowName { if payload.Action == "requested" { - logger.Info("Nightly trigger workflow started, provisioning E2E servers") - go s.handleNightlyE2ETrigger(owner, repoName, headBranch, headSHA, payload.WorkflowRun.Event, runID, logger) - } - return - } - - // --- Test workflow completion: clean up provisioned instances --- - if payload.Action == "completed" && s.isE2ETestWorkflow(workflowName) { - logger.Info("Test workflow completed, checking for instance cleanup") - - // Primary: look up by mw_tracking_key embedded at dispatch time (immune to SHA races). - if trackingKey := payload.WorkflowRun.Inputs["mw_tracking_key"]; trackingKey != "" { - s.e2eInstancesLock.Lock() - instances := s.e2eInstances[trackingKey] - delete(s.e2eInstances, trackingKey) - s.e2eInstancesLock.Unlock() - if len(instances) > 0 { - logger.WithField("tracking_key", trackingKey).Info("Destroying instances by tracking key") - s.destroyE2EInstances(instances, logger) + // Gate CMT to the cases we actually want. The trigger workflow fires on RC tag pushes + // (e.g. `v6.2.0-rc.1`) and on manual dispatch. For tag pushes the workflow_run + // payload's head_branch carries the *tag name*, not a branch. Provision CMT when: + // - the run was started manually (workflow_dispatch, any ref), OR + // - head_branch is an RC tag (isRCTag), OR + // - head_branch is a release branch (defense-in-depth for legacy/manual triggers). + triggerEvent := payload.WorkflowRun.Event + if s.shouldTriggerCMT(triggerEvent, headBranch) { + logger.WithFields(logrus.Fields{ + "trigger_event": triggerEvent, + "head_branch": headBranch, + }).Info("CMT trigger workflow started, provisioning E2E servers for configured versions") + go s.handleCMTTrigger(owner, repoName, headBranch, headSHA, runID, logger) } else { - logger.WithField("tracking_key", trackingKey).Debug("No in-memory instances for tracking key (matterwick restarted or already cleaned)") + logger.WithFields(logrus.Fields{ + "trigger_event": triggerEvent, + "head_branch": headBranch, + }).Info("CMT trigger fired on non-RC-tag, non-release ref and not via manual dispatch; skipping") } - return } - - // Fallback: SHA-based scan (runs dispatched before mw_tracking_key was introduced). - logger.Debug("No mw_tracking_key in workflow inputs, falling back to SHA-based instance cleanup") - s.findAndDestroyInstancesBySHA(repoName, headSHA, logger) return } - logger.WithFields(logrus.Fields{ - "configured_nightly_name": s.Config.E2ENightlyTriggerWorkflowName, - "configured_test_workflows": s.Config.E2ETestWorkflowNames, - }).Info("Ignoring workflow_run event (not relevant to E2E lifecycle)") -} - -// handleNightlyE2ETrigger provisions instances and dispatches the test workflow. -// Called when the E2E trigger workflow starts, whether from schedule, push to master/main, -// or push to a release branch. The triggerEvent parameter ("schedule", "push", etc.) is -// used to set runType correctly — scheduled runs always get "NIGHTLY" regardless of branch. -func (s *Server) handleNightlyE2ETrigger(owner, repoName, branch, sha, triggerEvent string, runID int64, logger logrus.FieldLogger) { - logger = logger.WithFields(logrus.Fields{ - "branch": branch, - "sha": sha, - "run_id": runID, - }) - logger.Info("Provisioning nightly E2E instances") - - instanceType := "desktop" - if strings.Contains(repoName, "mobile") { - instanceType = "mobile" - } else if !strings.Contains(repoName, "desktop") { - logger.Warn("Repository is neither desktop nor mobile, skipping nightly E2E trigger") - return - } - - instances, err := s.createCMTInstancesForVersion(repoName, instanceType, s.resolveE2EServerVersion(), "nightly") - if err != nil { - logger.WithError(err).Error("Failed to create nightly E2E instances") - return - } - - // Include runID so two trigger runs against the same SHA (e.g. manual re-trigger) - // get separate tracking keys. The key still ends with "-{sha}" so - // findAndDestroyInstancesBySHA continues to match it by suffix. - key := fmt.Sprintf("%s-scheduled-%d-%s", repoName, runID, sha) - s.e2eInstancesLock.Lock() - s.e2eInstances[key] = instances - s.e2eInstancesLock.Unlock() - - logger.WithField("tracking_key", key).Info("Nightly instances tracked, dispatching test workflow") - - // Determine run classification. Scheduled runs are always NIGHTLY regardless of branch - // (a scheduled run on master must not be classified as MASTER). Push-triggered runs - // derive their type from the branch name. - runType := "NIGHTLY" - nightly := true - if triggerEvent != "schedule" { - if branch == "master" || branch == "main" { - runType = "MASTER" - nightly = false - } else if s.isReleaseBranch(branch) { - runType = "RELEASE" - nightly = false - } - } - - var dispatchErr error - if instanceType == "desktop" { - instanceDetailsJSON, err := s.buildInstanceDetailsJSON(instances) - if err != nil { - logger.WithError(err).Error("Failed to build instance details JSON for nightly desktop run") - s.e2eInstancesLock.Lock() - delete(s.e2eInstances, key) - s.e2eInstancesLock.Unlock() - s.destroyE2EInstances(instances, logger) - return - } - // Pass the tracking key so the workflow_run completed handler can clean up by - // direct key lookup rather than SHA suffix matching (immune to new commits during - // the ~30 min instance-creation window). - dispatchErr = s.dispatchDesktopE2EWorkflow(owner, repoName, branch, sha, instanceDetailsJSON, runType, key, nightly) - } else { - if len(instances) < 3 { - logger.Errorf("Expected 3 mobile instances, got %d", len(instances)) - s.e2eInstancesLock.Lock() - delete(s.e2eInstances, key) - s.e2eInstancesLock.Unlock() - s.destroyE2EInstances(instances, logger) - return - } - dispatchErr = s.dispatchMobileE2EWorkflow(owner, repoName, branch, sha, - instances[0].URL, instances[1].URL, instances[2].URL, "both", runType, key) - } - - if dispatchErr != nil { - logger.WithError(dispatchErr).Error("Failed to dispatch test workflow for nightly run; cleaning up instances") - s.e2eInstancesLock.Lock() - delete(s.e2eInstances, key) - s.e2eInstancesLock.Unlock() - s.destroyE2EInstances(instances, logger) + // --- Test workflow completion: clean up provisioned instances --- + // + // Cleanup is correlated on head_sha. GitHub's workflow_run webhook payload does not + // include workflow_dispatch inputs, and its run id is the *test* run's id (not the + // trigger run id embedded in our tracking key), so head_sha is the only field we can + // match on (push and CMT tracking keys all end with "-{sha}"). + // + // `cmtOnly` scopes cleanup to the completing workflow's flow (CMT vs. non-CMT). It's + // kept as defense-in-depth so a future case where two flows share a SHA (e.g. a manual + // CMT dispatch coinciding with a master push) can't tear down each other's servers. + // + // If the branch advances during provisioning, the dispatched run's head_sha can differ + // from the tracked SHA and no key will match here; such orphans are reaped by the + // periodic cleanupStaleE2EInstances scan (bounded by E2EInstanceMaxAge). + if payload.Action == "completed" && s.isE2ETestWorkflow(workflowName) { + cmtOnly := workflowName == cmtTestWorkflowName + logger.WithField("cmt_only", cmtOnly).Info("Test workflow completed, cleaning up matching instances by SHA") + s.findAndDestroyInstancesBySHA(repoName, headSHA, cmtOnly, logger) return } - logger.Info("Nightly E2E workflow dispatched successfully") + logger.WithField("configured_test_workflows", s.Config.E2ETestWorkflowNames). + Info("Ignoring workflow_run event (not relevant to E2E lifecycle)") } // isE2ETestWorkflow returns true if the workflow name is a configured E2E test workflow @@ -256,23 +148,42 @@ func (s *Server) isE2ETestWorkflow(name string) bool { return false } -// findAndDestroyInstancesBySHA scans the instance map for entries belonging to repoName -// whose tracking key ends with "-{headSHA}" (push-event, scheduled, and cmt keys) and destroys them. -func (s *Server) findAndDestroyInstancesBySHA(repoName, headSHA string, logger logrus.FieldLogger) { +// cmtTestWorkflowName is the workflow "name:" of compatibility-matrix-testing.yml in the +// desktop and mobile repos. It identifies CMT test-workflow completions so SHA-based cleanup +// only reaps CMT instances (keys "{repo}-cmt-…-{sha}") and leaves a concurrent nightly/push +// run on the same SHA untouched (and vice versa). +const cmtTestWorkflowName = "Compatibility Matrix Testing" + +// instanceKeyMatchesSHA reports whether a tracking key belongs to repoName, ends with the given +// head SHA, and matches the requested flow: CMT keys ("{repo}-cmt-…") when cmtOnly is true, +// non-CMT keys (push/scheduled) when false. This scoping prevents a completing workflow from +// reaping a different flow's run that happens to share the same commit SHA (e.g. the monthly CMT +// trigger and the nightly trigger firing on the same master commit). +func instanceKeyMatchesSHA(key, repoName, headSHA string, cmtOnly bool) bool { + if !strings.HasPrefix(key, repoName+"-") || !strings.HasSuffix(key, "-"+headSHA) { + return false + } + return strings.HasPrefix(key, repoName+"-cmt-") == cmtOnly +} + +// findAndDestroyInstancesBySHA scans the instance map for entries belonging to repoName whose +// tracking key ends with "-{headSHA}" and destroys them. When cmtOnly is true only CMT keys +// ("{repo}-cmt-…") match; when false only non-CMT keys (push/scheduled) match — so a completing +// workflow never tears down a different flow's run that happens to share the same SHA. +func (s *Server) findAndDestroyInstancesBySHA(repoName, headSHA string, cmtOnly bool, logger logrus.FieldLogger) { if headSHA == "" { return } - prefix := repoName + "-" - suffix := "-" + headSHA s.e2eInstancesLock.Lock() var found []*E2EInstance var keysToDelete []string for key, instances := range s.e2eInstances { - if strings.HasPrefix(key, prefix) && strings.HasSuffix(key, suffix) { - found = append(found, instances...) - keysToDelete = append(keysToDelete, key) + if !instanceKeyMatchesSHA(key, repoName, headSHA, cmtOnly) { + continue } + found = append(found, instances...) + keysToDelete = append(keysToDelete, key) } for _, k := range keysToDelete { delete(s.e2eInstances, k) @@ -298,11 +209,79 @@ func parseServerVersionsFromString(input string) []string { return versions } +// shouldTriggerCMT decides whether a CMT-trigger workflow_run should actually provision CMT. +// CMT runs only when: +// - the run was started manually (workflow_dispatch, any ref), OR +// - head_branch is an RC tag (desktop convention: v6.2.0-rc.1), OR +// - head_branch is a mobile release-build branch (build-release-NNN, 3+ digits), OR +// - head_branch is a release branch (defense-in-depth — legacy & for any manual ref). +// +// For tag-push events GitHub sets workflow_run.head_branch to the tag name (no refs/tags/ +// prefix), so an RC tag like "v6.2.0-rc.1" arrives in headBranch. +func (s *Server) shouldTriggerCMT(triggerEvent, headBranch string) bool { + return triggerEvent == "workflow_dispatch" || + isRCTag(headBranch) || + isBuildReleaseBranch(headBranch) || + s.isReleaseBranch(headBranch) +} + +// rcTagPattern matches RC tags like "v6.2.0-rc.1", "v2.41.0-rc.10", "6.2.0-rc.1". The leading +// "v" is optional, and the rc suffix can be "-rc.N", "-rcN", or "-rc-N" (we keep it permissive +// but require the literal "-rc" and a trailing number). Compiled once at package init. +var rcTagPattern = regexp.MustCompile(`^v?\d+\.\d+\.\d+-rc[.\-]?\d+$`) + +// isRCTag reports whether ref looks like a release-candidate tag we want to trigger CMT on. +// Excludes GA tags (v6.2.0), beta/alpha pre-releases (v1.0.22-beta), and nightly tags +// (6.3.0-nightly.20260601). +func isRCTag(ref string) bool { + return rcTagPattern.MatchString(ref) +} + +// buildReleaseBranchPattern matches mobile's release-build branches: "build-release-" followed +// by 3 or more digits (current convention is 3- or 4-digit build numbers like 786 / 1100). +// 3+ digits is intentional — rejects accidental test branches like "build-release-1" without +// locking an upper bound, so a future move to 5-digit build numbers needs no code change. +// Platform-specific variants (build-release-ios-NNN, build-release-sim-NNN, +// build-release-android-NNN) don't start with a digit after the "build-release-" prefix and +// are excluded. +var buildReleaseBranchPattern = regexp.MustCompile(`^build-release-\d{3,}$`) + +// isBuildReleaseBranch reports whether ref matches mobile's build-release-NNN convention. +// The branch is created by the "Mattermost Mobile Release" external workflow when an RC +// build is dispatched to TestFlight / Play Store, so it's mobile's equivalent of an RC tag +// cut. Used as a separate gate from isReleaseBranch because release-* fires on every +// cherry-pick / version-bump push, which would be far too noisy for CMT. +func isBuildReleaseBranch(ref string) bool { + return buildReleaseBranchPattern.MatchString(ref) +} + +// handleCMTTrigger is invoked when the scheduled CMT trigger workflow fires. It resolves the +// instance type from the repo, resolves the server-version set (auto-derived from Mattermost +// releases, or the manual Config.CMTServerVersions override), and hands off to +// handleCMTWithServerVersions. Nothing needs to be read from the workflow_run event. +func (s *Server) handleCMTTrigger(owner, repoName, branch, sha string, runID int64, logger logrus.FieldLogger) { + instanceType := "desktop" + if strings.Contains(repoName, "mobile") { + instanceType = "mobile" + } else if !strings.Contains(repoName, "desktop") { + logger.Warn("Repository is neither desktop nor mobile, skipping CMT trigger") + return + } + + versions := s.cmtServerVersions() + logger.WithFields(logrus.Fields{ + "instanceType": instanceType, + "versions": versions, + }).Info("Provisioning CMT instances for resolved server versions") + + s.handleCMTWithServerVersions(owner, repoName, instanceType, branch, sha, versions, runID, logger) +} + // handleCMTWithServerVersions orchestrates CMT testing: creates one instance per server // version, builds the CMT_MATRIX JSON, and dispatches compatibility-matrix-testing.yml once. func (s *Server) handleCMTWithServerVersions(repoOwner, repoName, instanceType, branch, sha string, serverVersions []string, runID int64, logger logrus.FieldLogger) { - // Cap at 5 versions to prevent runaway provisioning - const maxVersions = 5 + // Cap the number of versions (one cloud instance per version) to prevent runaway provisioning. + const maxVersions = 10 if len(serverVersions) > maxVersions { logger.Warnf("Capping server versions from %d to %d", len(serverVersions), maxVersions) serverVersions = serverVersions[:maxVersions] @@ -348,10 +327,18 @@ func (s *Server) handleCMTWithServerVersions(repoOwner, repoName, instanceType, logger.WithField("totalInstances", len(allInstances)).Info("CMT instances created, tracking for cleanup") - // Track by runID+sha: runID prevents collision when two dispatches share the same - // branch HEAD SHA; the key still ends with "-{sha}" so findAndDestroyInstancesBySHA - // can locate it when compatibility-matrix-testing.yml completes (hours later). - key := fmt.Sprintf("%s-cmt-%d-%s", repoName, runID, sha) + // Key on the branch HEAD resolved now (just before dispatch), not the trigger SHA: the + // dispatched (ref=branch) run reports its head_sha as the branch HEAD at dispatch time, + // which can differ from the trigger SHA after the long provisioning window. runID keeps + // the key unique; it still ends with "-{sha}" so findAndDestroyInstancesBySHA matches it + // when compatibility-matrix-testing.yml completes. + cleanupSHA := sha + if resolved, resErr := s.resolveBranchHeadSHA(repoOwner, repoName, branch); resErr == nil && resolved != "" { + cleanupSHA = resolved + } else if resErr != nil { + logger.WithError(resErr).Warn("Failed to resolve branch HEAD SHA; keying cleanup on trigger SHA (periodic scan remains the backstop)") + } + key := fmt.Sprintf("%s-cmt-%d-%s", repoName, runID, cleanupSHA) s.e2eInstancesLock.Lock() s.e2eInstances[key] = allInstances s.e2eInstancesLock.Unlock() @@ -373,9 +360,7 @@ func (s *Server) handleCMTWithServerVersions(repoOwner, repoName, instanceType, return } - // Pass the tracking key so the workflow_run completed handler can clean up by - // direct key lookup rather than SHA suffix matching. - if err := s.dispatchCMTWorkflow(repoOwner, repoName, sha, branch, cmtMatrixJSON, instanceType, key, runID, logger); err != nil { + if err := s.dispatchCMTWorkflow(repoOwner, repoName, branch, cmtMatrixJSON, instanceType, logger); err != nil { logger.WithError(err).Error("Failed to dispatch compatibility-matrix-testing.yml") s.e2eInstancesLock.Lock() delete(s.e2eInstances, key) @@ -402,10 +387,19 @@ func (s *Server) createSingleCMTInstance(repoName, instanceType, version string, return s.createCloudInstallation(context.Background(), name, version, username, password, instanceType, logger) } -// cmtServer is the server entry in CMT_MATRIX JSON. +// cmtServer is the server entry in CMT_MATRIX JSON. Mobile uses the optional `latest` flag +// to mark the highest-semver entry, so the mobile workflow can decide to run the whole suite +// against the latest server and smoke-only against older ones. Desktop ignores it and the +// `omitempty` keeps desktop's matrix JSON unchanged (the field is just absent there). +// +// IMPORTANT — layering: matterwick only signals which server is "latest". It does NOT decide +// what tests run for latest vs non-latest — that policy (smoke path vs whole suite path, +// parallelism, etc.) lives in the mobile workflow, where the test-directory structure is +// known. type cmtServer struct { Version string `json:"version"` URL string `json:"url"` + Latest bool `json:"latest,omitempty"` } // buildDesktopCMTMatrixJSON builds the CMT_MATRIX JSON for compatibility-matrix-testing.yml @@ -457,14 +451,17 @@ func buildDesktopCMTMatrixJSON(versions []string, instances []*E2EInstance) (str } // buildMobileCMTMatrixJSON builds the CMT_MATRIX JSON for compatibility-matrix-testing.yml -// in the mobile repo. One iOS test job is created per server version. +// in the mobile repo. One iOS test job is created per server version. The highest-semver +// entry is marked `latest: true` so the mobile workflow can branch on it (e.g. whole suite +// for latest, smoke for older). What "latest" implies test-wise is the workflow's policy, +// not matterwick's — see the cmtServer doc comment. // // Schema: // // { // "server": [ -// {"version": "v11.1.0", "url": "https://..."}, -// ... +// {"version": "10.11.18", "url": "https://..."}, +// {"version": "11.7.1", "url": "https://...", "latest": true} // ] // } func buildMobileCMTMatrixJSON(versions []string, instances []*E2EInstance) (string, error) { @@ -472,12 +469,36 @@ func buildMobileCMTMatrixJSON(versions []string, instances []*E2EInstance) (stri Server []cmtServer `json:"server"` } + // Find the highest-semver entry across versions that parse successfully. parseCMTVersion + // handles vX.Y.Z and vX.Y.Z-rcN (with or without the "v"). Unparseable versions are + // silently skipped — they just don't compete for "latest". If none parse, no entry is + // marked, and the workflow can fall back to its default (smoke for all). + latestIdx := -1 + var latestVer cmtVersion + for i, version := range versions { + if i >= len(instances) { + break + } + v, ok := parseCMTVersion(version) + if !ok { + continue + } + if latestIdx == -1 || latestVer.less(v) { + latestVer = v + latestIdx = i + } + } + var matrix mobileCMTMatrix for i, version := range versions { if i >= len(instances) { break } - matrix.Server = append(matrix.Server, cmtServer{Version: version, URL: instances[i].URL}) + entry := cmtServer{Version: version, URL: instances[i].URL} + if i == latestIdx { + entry.Latest = true + } + matrix.Server = append(matrix.Server, entry) } b, err := json.Marshal(matrix) @@ -488,19 +509,18 @@ func buildMobileCMTMatrixJSON(versions []string, instances []*E2EInstance) (stri } // dispatchCMTWorkflow dispatches compatibility-matrix-testing.yml with the populated -// CMT_MATRIX JSON. trackingKey is the s.e2eInstances map key for this run; it is -// embedded as "mw_tracking_key" in the workflow inputs so the workflow_run completed -// handler can do a direct key lookup instead of fragile SHA suffix matching. -// runID is the CMT provisioner workflow run ID, passed as cmt_run_id so the test workflow -// can call back to Matterwick for instance cleanup. -func (s *Server) dispatchCMTWorkflow(repoOwner, repoName, sha, branch, cmtMatrixJSON, instanceType, trackingKey string, runID int64, logger logrus.FieldLogger) error { +// CMT_MATRIX JSON, targeting the branch ref (workflow_dispatch requires a branch/tag, not a SHA). +// +// We pass only the inputs the workflow declares (CMT_MATRIX + the per-platform version input). +// Passing an undeclared input (the old "mw_tracking_key" or "cmt_run_id") makes GitHub reject the +// dispatch with a 422. Cleanup is driven by SHA-suffix matching when the test workflow completes, +// so no run-id needs to be threaded through the workflow. +func (s *Server) dispatchCMTWorkflow(repoOwner, repoName, branch, cmtMatrixJSON, instanceType string, logger logrus.FieldLogger) error { ctx := context.Background() client := newGithubClient(s.Config.GithubAccessToken) workflowInputs := map[string]interface{}{ - "CMT_MATRIX": cmtMatrixJSON, - "cmt_run_id": fmt.Sprintf("%d", runID), - "mw_tracking_key": trackingKey, + "CMT_MATRIX": cmtMatrixJSON, } if instanceType == "desktop" { workflowInputs["DESKTOP_VERSION"] = branch @@ -616,18 +636,3 @@ func (s *Server) createCMTInstancesForVersion(repoName, instanceType, version, p logger.WithField("instanceCount", len(instances)).Info("Instances created for version") return instances, nil } - -// handleCMTRunCleanup is a best-effort fallback for CMT cleanup when the trigger workflow -// completes. Because the CMT trigger is a lightweight workflow that completes in seconds — -// well before the 30-minute provisioning goroutine stores instances — this function will -// most often find nothing. The primary cleanup path is findAndDestroyInstancesBySHA, -// triggered when compatibility-matrix-testing.yml completes. -func (s *Server) handleCMTRunCleanup(repoName, sha string, logger logrus.FieldLogger) { - logger = logger.WithFields(logrus.Fields{ - "repo": repoName, - "sha": sha, - "type": "cmt_cleanup_fallback", - }) - logger.Debug("CMT trigger completed — sha-based cleanup is the primary path") - s.findAndDestroyInstancesBySHA(repoName, sha, logger) -}