diff --git a/.github/workflows/snapshot-sources-probe.yml b/.github/workflows/snapshot-sources-probe.yml new file mode 100644 index 00000000..3bf02041 --- /dev/null +++ b/.github/workflows/snapshot-sources-probe.yml @@ -0,0 +1,141 @@ +name: Snapshot Sources Probe + +# Weekly HEAD-check of every upstream snapshot mirror trond knows about. +# The structural follow-up to Task #161 — when an upstream rotates an IP +# or pulls a bucket (as the Nile S3 mirror did), the probe surfaces it +# here rather than in a confused user's bug report. +# +# Policy: open exactly one rolling issue while any source is unhealthy. +# Subsequent failures append a comment to that issue instead of spawning +# duplicates. When all sources go green again, the workflow closes the +# issue automatically. + +on: + schedule: + # Mondays 09:00 UTC. Early enough that whoever triages on Monday + # morning sees fresh results; late enough that European weekday + # cadence isn't disturbed if a flake comes through. + - cron: "0 9 * * 1" + workflow_dispatch: {} + +permissions: + contents: read + issues: write + +jobs: + probe: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-go@v5 + with: + go-version: "1.25" + cache: true + + - name: Build trond + run: go build -o bin/trond ./ + + - name: Probe sources + id: probe + run: | + set +e + ./bin/trond snapshot sources --probe \ + --probe-timeout=10s \ + --probe-parallelism=4 \ + --stale-after=168h \ + --output json > probe.json + code=$? + echo "exit_code=$code" >> "$GITHUB_OUTPUT" + echo "--- text summary ---" + ./bin/trond snapshot sources --probe \ + --probe-timeout=10s \ + --probe-parallelism=4 \ + --stale-after=168h || true + exit 0 + + - name: Open or update rolling issue (on failure) + if: steps.probe.outputs.exit_code != '0' + uses: actions/github-script@v7 + with: + script: | + const fs = require('fs'); + const data = JSON.parse(fs.readFileSync('probe.json', 'utf8')); + const summary = data.summary || {}; + const bad = data.results.filter(r => r.status !== 'ok'); + + const label = 'snapshot-probe-stale'; + const body = [ + `Probe ran at \`${data.probed_at}\` and flagged **${bad.length}** of **${data.results.length}** sources as not OK.`, + '', + `Summary: \`${JSON.stringify(summary)}\``, + '', + '| status | network | kind | engine | domain | latest | age (d) | latency (ms) | detail |', + '|---|---|---|---|---|---|---|---|---|', + ...bad.map(r => `| ${r.status} | ${r.source.network} | ${r.source.kind} | ${r.source.engine} | ${r.source.domain} | ${r.latest_backup || '-'} | ${r.latest_age_days || '-'} | ${r.latency_ms} | ${r.err || ''} |`), + '', + 'Run locally to reproduce:', + '```bash', + 'trond snapshot sources --probe', + '```', + '', + `Edit \`internal/snapshot/sources.go\` to update mirror URLs once a working replacement is identified. Auto-filed by \`.github/workflows/snapshot-sources-probe.yml\`.`, + ].join('\n'); + + const { owner, repo } = context.repo; + const existing = await github.rest.issues.listForRepo({ + owner, repo, state: 'open', labels: label, per_page: 5, + }); + + if (existing.data.length === 0) { + const created = await github.rest.issues.create({ + owner, repo, + title: `[snapshot-probe] ${bad.length}/${data.results.length} mirrors unhealthy`, + body, + labels: [label], + }); + core.notice(`Opened issue #${created.data.number}`); + } else { + const issue = existing.data[0]; + await github.rest.issues.createComment({ + owner, repo, + issue_number: issue.number, + body: `Still unhealthy as of \`${data.probed_at}\`.\n\n${body}`, + }); + core.notice(`Commented on existing issue #${issue.number}`); + } + + - name: Auto-close rolling issue (on success) + if: steps.probe.outputs.exit_code == '0' + uses: actions/github-script@v7 + with: + script: | + const label = 'snapshot-probe-stale'; + const { owner, repo } = context.repo; + const open = await github.rest.issues.listForRepo({ + owner, repo, state: 'open', labels: label, per_page: 5, + }); + for (const issue of open.data) { + await github.rest.issues.createComment({ + owner, repo, + issue_number: issue.number, + body: `All ${(JSON.parse(require('fs').readFileSync('probe.json', 'utf8'))).results.length} sources OK on the latest probe — closing.`, + }); + await github.rest.issues.update({ + owner, repo, issue_number: issue.number, state: 'closed', + }); + core.notice(`Closed issue #${issue.number}`); + } + + - name: Upload probe.json + if: always() + uses: actions/upload-artifact@v4 + with: + name: probe-result + path: probe.json + retention-days: 30 + + - name: Fail job when any source not OK + if: steps.probe.outputs.exit_code != '0' + run: | + echo "snapshot probe reported failure; see issue / artifact for details" + exit 1 diff --git a/cmd/snapshot/sources.go b/cmd/snapshot/sources.go index 4f9355a1..b83778c1 100644 --- a/cmd/snapshot/sources.go +++ b/cmd/snapshot/sources.go @@ -1,9 +1,11 @@ package snapshot import ( + "context" "fmt" "os" "text/tabwriter" + "time" "github.com/spf13/cobra" @@ -13,21 +15,47 @@ import ( var sourcesCmd = &cobra.Command{ Use: "sources", - Short: "List known snapshot mirrors", + Short: "List or probe known snapshot mirrors", Long: `Print every snapshot source trond knows about, grouped by network and db kind (lite vs full). Pick one with --domain on download, or let -trond pick a default by passing --network and --type.`, +trond pick a default by passing --network and --type. + +Pass --probe to additionally HEAD-check each mirror and report which +ones still serve recent backups. Useful for CI to catch upstream +mirror rotations before users do — Task #161's structural follow-up.`, RunE: runSources, } +func init() { + sourcesCmd.Flags().Bool("probe", false, "HEAD-check every source and report reachability + freshness") + sourcesCmd.Flags().Duration("probe-timeout", 8*time.Second, "per-HEAD HTTP timeout when probing") + sourcesCmd.Flags().Duration("stale-after", 7*24*time.Hour, "age beyond which a reachable backup is reported as 'stale'") + sourcesCmd.Flags().Int("probe-parallelism", 5, "max concurrent HEAD checks during --probe") +} + func runSources(cmd *cobra.Command, _ []string) error { + probe, _ := cmd.Flags().GetBool("probe") outputFmt, _ := cmd.Flags().GetString("output") + + if !probe { + return printSourceTable(outputFmt) + } + + timeout, _ := cmd.Flags().GetDuration("probe-timeout") + stale, _ := cmd.Flags().GetDuration("stale-after") + parallelism, _ := cmd.Flags().GetInt("probe-parallelism") + return runProbe(cmd.Context(), outputFmt, snapshot.ProbeOptions{ + HTTPTimeout: timeout, + StaleAfter: stale, + }, parallelism) +} + +func printSourceTable(outputFmt string) error { if outputFmt == "json" { return output.WriteJSON(os.Stdout, map[string]any{ "sources": snapshot.SourceTable, }) } - tw := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0) fmt.Fprintln(tw, "NETWORK\tKIND\tENGINE\tREGION\tDOMAIN\t~SIZE\tNOTES") for _, s := range snapshot.SourceTable { @@ -36,3 +64,60 @@ func runSources(cmd *cobra.Command, _ []string) error { } return tw.Flush() } + +// runProbe walks SourceTable, HEAD-checks each, and prints results. +// Returns a non-nil error when any source is not ProbeOK so a CI step +// fails cleanly (exit code 1) without us hand-rolling os.Exit. JSON +// output still prints the full report before erroring. +func runProbe(ctx context.Context, outputFmt string, opts snapshot.ProbeOptions, parallelism int) error { + if ctx == nil { + ctx = context.Background() + } + results := snapshot.ProbeAll(ctx, snapshot.SourceTable, opts, parallelism) + + summary := map[snapshot.ProbeStatus]int{} + for _, r := range results { + summary[r.Status]++ + } + + if outputFmt == "json" { + _ = output.WriteJSON(os.Stdout, map[string]any{ + "probed_at": time.Now().UTC().Format(time.RFC3339), + "results": results, + "summary": summary, + }) + } else { + printProbeTable(results, summary) + } + + if summary[snapshot.ProbeOK] != len(results) { + return fmt.Errorf("%d/%d sources not OK (stale=%d unreachable=%d no_backups=%d bad_config=%d)", + len(results)-summary[snapshot.ProbeOK], len(results), + summary[snapshot.ProbeStale], summary[snapshot.ProbeUnreachable], + summary[snapshot.ProbeNoBackups], summary[snapshot.ProbeBadConfig]) + } + return nil +} + +func printProbeTable(results []snapshot.ProbeResult, summary map[snapshot.ProbeStatus]int) { + tw := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0) + fmt.Fprintln(tw, "STATUS\tNETWORK\tKIND\tENGINE\tDOMAIN\tLATEST\tAGE\tLATENCY\tDETAIL") + for _, r := range results { + latest := r.LatestBackup + if latest == "" { + latest = "-" + } + age := "-" + if r.LatestAgeDays > 0 { + age = fmt.Sprintf("%dd", r.LatestAgeDays) + } + fmt.Fprintf(tw, "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%dms\t%s\n", + r.Status, r.Source.Network, r.Source.DBKind, r.Source.DBEngine, + r.Source.Domain, latest, age, r.LatencyMs, r.Err) + } + _ = tw.Flush() + fmt.Printf("\nsummary: ok=%d stale=%d unreachable=%d no_backups=%d bad_config=%d\n", + summary[snapshot.ProbeOK], summary[snapshot.ProbeStale], + summary[snapshot.ProbeUnreachable], summary[snapshot.ProbeNoBackups], + summary[snapshot.ProbeBadConfig]) +} diff --git a/internal/snapshot/probe.go b/internal/snapshot/probe.go new file mode 100644 index 00000000..0723fc92 --- /dev/null +++ b/internal/snapshot/probe.go @@ -0,0 +1,205 @@ +package snapshot + +import ( + "context" + "fmt" + "net/http" + "strings" + "time" +) + +// ProbeStatus is the outcome of a single Probe call. +type ProbeStatus string + +const ( + ProbeOK ProbeStatus = "ok" // recent backup returns 200 + ProbeStale ProbeStatus = "stale" // some backup returns 200, but it's older than staleAfter + ProbeUnreachable ProbeStatus = "unreachable" // no backup returns 200 within the candidate window + ProbeNoBackups ProbeStatus = "no_backups" // ListBackups returned an empty slice + ProbeBadConfig ProbeStatus = "bad_config" // source has an unknown IndexStrategy / missing fields +) + +// ProbeResult captures everything we want to surface back to a human +// reader or to a CI workflow. Source is embedded so a JSON consumer +// can group by Domain / Network without a separate lookup. +type ProbeResult struct { + Source Source `json:"source"` + Status ProbeStatus `json:"status"` + LatestBackup string `json:"latest_backup,omitempty"` // e.g. "backup20260524" + LatestAgeDays int `json:"latest_age_days,omitempty"` + LatencyMs int64 `json:"latency_ms,omitempty"` + Err string `json:"err,omitempty"` +} + +// ProbeOptions tunes Probe behaviour. Zero values are safe defaults +// (8s per-request HTTP timeout, 7d staleness threshold, 12 candidate +// HEADs before giving up). +type ProbeOptions struct { + HTTPTimeout time.Duration // per-HEAD timeout. 0 → 8s + StaleAfter time.Duration // age beyond which a working URL is "stale". 0 → 7 days + MaxCandidates int // how many backup names to HEAD-check. 0 → 12 + HTTPClient *http.Client // optional override (tests inject a fake) +} + +// Probe HEAD-checks the latest available tarball for a single Source. +// +// For "date"-strategy mirrors (nile) we walk the generated date list +// newest-to-oldest and stop at the first 200. For "html"-strategy +// mirrors (mainnet) we scrape the index page and likewise check from +// newest to oldest. +// +// We deliberately do NOT trust the existing snapshot.LatestBackup +// helper for the "date" case — that one returns the topmost candidate +// without HEAD-checking, which would tell us nothing about whether the +// upstream is actually serving anything. Probe's whole point is to +// observe real reachability. +func Probe(ctx context.Context, s Source, opts ProbeOptions) ProbeResult { + if opts.HTTPTimeout == 0 { + opts.HTTPTimeout = 8 * time.Second + } + if opts.StaleAfter == 0 { + opts.StaleAfter = 7 * 24 * time.Hour + } + if opts.MaxCandidates == 0 { + opts.MaxCandidates = 12 + } + client := opts.HTTPClient + if client == nil { + client = &http.Client{Timeout: opts.HTTPTimeout} + } + + res := ProbeResult{Source: s} + + candidates, err := listProbeCandidates(ctx, s, client) + if err != nil { + res.Status = ProbeBadConfig + res.Err = err.Error() + return res + } + if len(candidates) == 0 { + res.Status = ProbeNoBackups + res.Err = "no backups returned by source" + return res + } + + if len(candidates) > opts.MaxCandidates { + candidates = candidates[:opts.MaxCandidates] + } + + // Tarball kind: we prefer probing the SAME tarball variant a user + // would download — lite if the source has a lite tarball, otherwise + // full. The Nile rocksdb mirror publishes only FullNode_*.tgz. + kind := s.DBKind + + for _, backup := range candidates { + url := TarballURL(s, backup, kind) + if url == "" { + continue + } + start := time.Now() + ok, herr := headOK(ctx, client, url) + res.LatencyMs = time.Since(start).Milliseconds() + if herr != nil && res.Err == "" { + res.Err = herr.Error() + } + if !ok { + continue + } + // First hit wins. Decide ok vs stale based on the candidate's age. + ageDays := backupAgeDays(backup) + res.LatestBackup = backup + res.LatestAgeDays = ageDays + if time.Duration(ageDays)*24*time.Hour > opts.StaleAfter { + res.Status = ProbeStale + res.Err = fmt.Sprintf("most recent reachable backup is %dd old (threshold %s)", + ageDays, opts.StaleAfter) + } else { + res.Status = ProbeOK + res.Err = "" // clear any transient error from prior HEAD misses + } + return res + } + + res.Status = ProbeUnreachable + if res.Err == "" { + res.Err = "no candidate URL returned HTTP 200" + } + return res +} + +// listProbeCandidates returns plausible backup names newest-first for +// HEAD-checking. Thin shim over ListBackups so probe.go can be tested +// independently of the live index scrapers. +func listProbeCandidates(ctx context.Context, s Source, _ *http.Client) ([]string, error) { + switch s.IndexStrategy { + case "date", "html": + return ListBackups(ctx, s) + case "": + return nil, fmt.Errorf("source %s has no IndexStrategy set", s.Domain) + default: + return nil, fmt.Errorf("source %s has unknown IndexStrategy %q", s.Domain, s.IndexStrategy) + } +} + +// headOK returns true when a HEAD against url returns a 2xx. Surfaces +// transport errors separately so the caller can keep them as context. +func headOK(ctx context.Context, client *http.Client, url string) (bool, error) { + req, err := http.NewRequestWithContext(ctx, http.MethodHead, url, nil) + if err != nil { + return false, err + } + resp, err := client.Do(req) + if err != nil { + return false, err + } + defer resp.Body.Close() + return resp.StatusCode >= 200 && resp.StatusCode < 300, nil +} + +// backupAgeDays returns how many days ago a "backup20260524"-style name +// is, relative to time.Now().UTC(). Returns -1 when the name doesn't +// parse as a date — we treat that as "age unknown" rather than "fresh", +// so html-strategy mirrors that don't include a date in the directory +// name simply skip the staleness check (Status stays ProbeOK). +func backupAgeDays(backup string) int { + name := strings.TrimPrefix(backup, "backup") + name = strings.ReplaceAll(name, "-", "") + if len(name) != 8 { + return -1 + } + t, err := time.Parse("20060102", name) + if err != nil { + return -1 + } + d := time.Since(t) + if d < 0 { + return 0 + } + return int(d / (24 * time.Hour)) +} + +// ProbeAll concurrently probes every source in sources. The slice is +// returned in the same order as the input. +func ProbeAll(ctx context.Context, sources []Source, opts ProbeOptions, parallelism int) []ProbeResult { + if parallelism <= 0 { + parallelism = 5 + } + results := make([]ProbeResult, len(sources)) + sem := make(chan struct{}, parallelism) + done := make(chan int, len(sources)) + + for i, src := range sources { + sem <- struct{}{} + go func(idx int, s Source) { + defer func() { + <-sem + done <- idx + }() + results[idx] = Probe(ctx, s, opts) + }(i, src) + } + for range sources { + <-done + } + return results +} diff --git a/internal/snapshot/probe_test.go b/internal/snapshot/probe_test.go new file mode 100644 index 00000000..54a8ac39 --- /dev/null +++ b/internal/snapshot/probe_test.go @@ -0,0 +1,158 @@ +package snapshot + +import ( + "context" + "fmt" + "net/http" + "net/http/httptest" + "testing" + "time" +) + +// buildProbeMirror returns a Source wired to a fresh httptest server +// that serves a LiteFullNode tarball at exactly the backup dates +// passed in. A HEAD to any other backup path returns 404. The server +// itself is cleaned up via t.Cleanup — callers never need to touch +// it, which is why only Source is returned (unparam-clean). +func buildProbeMirror(t *testing.T, serveDates ...string) Source { + t.Helper() + mux := http.NewServeMux() + for _, d := range serveDates { + path := fmt.Sprintf("/backup%s/LiteFullNode_output-directory.tgz", d) + mux.HandleFunc(path, func(w http.ResponseWriter, _ *http.Request) { + w.Header().Set("Content-Length", "1024") + w.WriteHeader(http.StatusOK) + }) + } + mux.HandleFunc("/", func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusNotFound) + }) + srv := httptest.NewServer(mux) + t.Cleanup(srv.Close) + return Source{ + Network: NetworkNile, + DBKind: DBKindLite, + DBEngine: EngineLevelDB, + Domain: "test.local", + BaseURL: srv.URL, + IndexStrategy: "date", + } +} + +func TestProbe_OKWhenRecentBackupServes(t *testing.T) { + // Yesterday is always in the generated date list (i starts at 1). + yesterday := time.Now().UTC().AddDate(0, 0, -1).Format("20060102") + src := buildProbeMirror(t, yesterday) + + r := Probe(context.Background(), src, ProbeOptions{ + HTTPTimeout: 2 * time.Second, + }) + if r.Status != ProbeOK { + t.Fatalf("status: want %s, got %s (err=%s)", ProbeOK, r.Status, r.Err) + } + if r.LatestBackup != "backup"+yesterday { + t.Fatalf("LatestBackup: want backup%s, got %s", yesterday, r.LatestBackup) + } + if r.LatestAgeDays != 1 { + t.Fatalf("LatestAgeDays: want 1, got %d", r.LatestAgeDays) + } +} + +func TestProbe_StaleWhenOnlyOldBackupServes(t *testing.T) { + // A backup older than the staleness threshold but still inside the + // generated date list. 35 days back is in the "10/20/30" tier of + // generateDateList. To guarantee it lands on a 10/20/30 day, pick a + // fixed-but-rolling target. + old := time.Now().UTC().AddDate(0, 0, -45) + // Snap to the nearest 10/20/30 of that month, going backwards. + for d := old.Day(); d > 0; d-- { + if d == 10 || d == 20 || d == 30 { + old = time.Date(old.Year(), old.Month(), d, 0, 0, 0, 0, time.UTC) + break + } + } + oldStr := old.Format("20060102") + src := buildProbeMirror(t, oldStr) + + r := Probe(context.Background(), src, ProbeOptions{ + HTTPTimeout: 2 * time.Second, + StaleAfter: 7 * 24 * time.Hour, + MaxCandidates: 100, // walk far enough back to hit the old date + }) + if r.Status != ProbeStale { + t.Fatalf("status: want %s, got %s (err=%s, latest=%s)", + ProbeStale, r.Status, r.Err, r.LatestBackup) + } + if r.LatestAgeDays < 7 { + t.Fatalf("expected age > threshold, got %d days", r.LatestAgeDays) + } +} + +func TestProbe_UnreachableWhenNothingServes(t *testing.T) { + src := buildProbeMirror(t /* no served dates */) + r := Probe(context.Background(), src, ProbeOptions{ + HTTPTimeout: 1 * time.Second, + MaxCandidates: 5, + }) + if r.Status != ProbeUnreachable { + t.Fatalf("status: want %s, got %s", ProbeUnreachable, r.Status) + } +} + +func TestProbe_BadConfigOnUnknownStrategy(t *testing.T) { + src := Source{IndexStrategy: "wat", BaseURL: "http://localhost"} + r := Probe(context.Background(), src, ProbeOptions{}) + if r.Status != ProbeBadConfig { + t.Fatalf("status: want %s, got %s", ProbeBadConfig, r.Status) + } +} + +func TestBackupAgeDays(t *testing.T) { + now := time.Now().UTC() + cases := []struct { + name string + input string + wantOK bool + }{ + {"unhyphenated 5d", "backup" + now.AddDate(0, 0, -5).Format("20060102"), true}, + {"hyphenated 3d", "backup" + now.AddDate(0, 0, -3).Format("2006-01-02"), true}, + {"unparseable", "backupNOPE", false}, + {"empty", "", false}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + got := backupAgeDays(c.input) + if c.wantOK && got < 0 { + t.Errorf("input %q: expected a non-negative age, got %d", c.input, got) + } + if !c.wantOK && got >= 0 { + t.Errorf("input %q: expected -1 sentinel, got %d", c.input, got) + } + }) + } +} + +func TestProbeAll_PreservesInputOrder(t *testing.T) { + yesterday := time.Now().UTC().AddDate(0, 0, -1).Format("20060102") + srcA := buildProbeMirror(t, yesterday) + srcA.Domain = "alpha" + srcB := buildProbeMirror(t /* nothing */) + srcB.Domain = "bravo" + results := ProbeAll(context.Background(), []Source{srcA, srcB}, ProbeOptions{ + HTTPTimeout: 1 * time.Second, + MaxCandidates: 3, + }, 2) + if len(results) != 2 { + t.Fatalf("want 2 results, got %d", len(results)) + } + if results[0].Source.Domain != "alpha" || results[1].Source.Domain != "bravo" { + t.Fatalf("order not preserved: %s, %s", + results[0].Source.Domain, results[1].Source.Domain) + } + if results[0].Status != ProbeOK { + t.Errorf("alpha: want ok, got %s", results[0].Status) + } + if results[1].Status != ProbeUnreachable { + t.Errorf("bravo: want unreachable, got %s", results[1].Status) + } +}