Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 141 additions & 0 deletions .github/workflows/snapshot-sources-probe.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
name: Snapshot Sources Probe

# Weekly HEAD-check of every upstream snapshot mirror trond knows about.
# The structural follow-up to Task #161 — when an upstream rotates an IP
# or pulls a bucket (as the Nile S3 mirror did), the probe surfaces it
# here rather than in a confused user's bug report.
#
# Policy: open exactly one rolling issue while any source is unhealthy.
# Subsequent failures append a comment to that issue instead of spawning
# duplicates. When all sources go green again, the workflow closes the
# issue automatically.

on:
schedule:
# Mondays 09:00 UTC. Early enough that whoever triages on Monday
# morning sees fresh results; late enough that European weekday
# cadence isn't disturbed if a flake comes through.
- cron: "0 9 * * 1"
# Manual trigger with no inputs, for on-demand runs outside the weekly
# schedule.
workflow_dispatch: {}

# Token scopes for this workflow: read-only repository contents, plus write
# access to issues because the github-script steps in this workflow create,
# comment on and close issues.
permissions:
contents: read
issues: write

jobs:
probe:
runs-on: ubuntu-latest
steps:
# Check out the repository and install Go 1.25 (with the setup action's
# cache option enabled) before compiling.
- uses: actions/checkout@v4
- uses: actions/setup-go@v5
with:
go-version: "1.25"
cache: true

# Compile the package at the repository root into bin/trond; the probe step
# runs this binary.
- name: Build trond
run: go build -o bin/trond ./

# Runs the probe twice:
#   1. with --output json, redirected to probe.json; its exit status is
#      captured and published as the step output `exit_code`;
#   2. without --output json, purely to print a human-readable summary into
#      the job log (its exit status is discarded via `|| true`).
# `set +e` keeps a non-zero probe exit from aborting the script, and the
# step always ends with `exit 0`, so the steps after this one branch on
# `steps.probe.outputs.exit_code` instead of on this step's status.
- name: Probe sources
id: probe
run: |
set +e
./bin/trond snapshot sources --probe \
--probe-timeout=10s \
--probe-parallelism=4 \
--stale-after=168h \
--output json > probe.json
code=$?
echo "exit_code=$code" >> "$GITHUB_OUTPUT"
echo "--- text summary ---"
./bin/trond snapshot sources --probe \
--probe-timeout=10s \
--probe-parallelism=4 \
--stale-after=168h || true
exit 0

# Runs only when the probe command exited non-zero. The script reads
# probe.json, keeps the results whose status is not 'ok', and renders them
# as a markdown table. It then lists open issues carrying the
# `snapshot-probe-stale` label: if there are none it creates a new labelled
# issue; otherwise it appends a "Still unhealthy" comment to the first issue
# returned.
# NOTE(review): the JSON field names read here (source.kind, source.engine,
# latest_backup, latest_age_days, latency_ms, err) depend on the JSON tags of
# the Go result types, which are not visible in this file - confirm they match.
# NOTE(review): `r.err` is interpolated into a table cell unescaped; a '|' or
# newline in the error text would break the row.
- name: Open or update rolling issue (on failure)
if: steps.probe.outputs.exit_code != '0'
uses: actions/github-script@v7
with:
script: |
const fs = require('fs');
const data = JSON.parse(fs.readFileSync('probe.json', 'utf8'));
const summary = data.summary || {};
const bad = data.results.filter(r => r.status !== 'ok');

const label = 'snapshot-probe-stale';
const body = [
`Probe ran at \`${data.probed_at}\` and flagged **${bad.length}** of **${data.results.length}** sources as not OK.`,
'',
`Summary: \`${JSON.stringify(summary)}\``,
'',
'| status | network | kind | engine | domain | latest | age (d) | latency (ms) | detail |',
'|---|---|---|---|---|---|---|---|---|',
...bad.map(r => `| ${r.status} | ${r.source.network} | ${r.source.kind} | ${r.source.engine} | ${r.source.domain} | ${r.latest_backup || '-'} | ${r.latest_age_days || '-'} | ${r.latency_ms} | ${r.err || ''} |`),
'',
'Run locally to reproduce:',
'```bash',
'trond snapshot sources --probe',
'```',
'',
`Edit \`internal/snapshot/sources.go\` to update mirror URLs once a working replacement is identified. Auto-filed by \`.github/workflows/snapshot-sources-probe.yml\`.`,
].join('\n');

const { owner, repo } = context.repo;
const existing = await github.rest.issues.listForRepo({
owner, repo, state: 'open', labels: label, per_page: 5,
});

if (existing.data.length === 0) {
const created = await github.rest.issues.create({
owner, repo,
title: `[snapshot-probe] ${bad.length}/${data.results.length} mirrors unhealthy`,
body,
labels: [label],
});
core.notice(`Opened issue #${created.data.number}`);
} else {
const issue = existing.data[0];
await github.rest.issues.createComment({
owner, repo,
issue_number: issue.number,
body: `Still unhealthy as of \`${data.probed_at}\`.\n\n${body}`,
});
core.notice(`Commented on existing issue #${issue.number}`);
}

# Runs only when the probe command exited 0. Lists up to five open issues
# labelled `snapshot-probe-stale` and, for each one, posts a closing comment
# (quoting the number of results found in probe.json) and then sets the
# issue state to closed.
- name: Auto-close rolling issue (on success)
if: steps.probe.outputs.exit_code == '0'
uses: actions/github-script@v7
with:
script: |
const label = 'snapshot-probe-stale';
const { owner, repo } = context.repo;
const open = await github.rest.issues.listForRepo({
owner, repo, state: 'open', labels: label, per_page: 5,
});
for (const issue of open.data) {
await github.rest.issues.createComment({
owner, repo,
issue_number: issue.number,
body: `All ${(JSON.parse(require('fs').readFileSync('probe.json', 'utf8'))).results.length} sources OK on the latest probe — closing.`,
});
await github.rest.issues.update({
owner, repo, issue_number: issue.number, state: 'closed',
});
core.notice(`Closed issue #${issue.number}`);
}

# Publish probe.json as the `probe-result` artifact, kept for 30 days.
# `always()` makes this run even when an earlier step failed.
- name: Upload probe.json
if: always()
uses: actions/upload-artifact@v4
with:
name: probe-result
path: probe.json
retention-days: 30

# The probe step itself always exits 0, so this final step is what marks
# the job as failed when the recorded probe exit code was non-zero.
- name: Fail job when any source not OK
if: steps.probe.outputs.exit_code != '0'
run: |
echo "snapshot probe reported failure; see issue / artifact for details"
exit 1
91 changes: 88 additions & 3 deletions cmd/snapshot/sources.go
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
package snapshot

import (
"context"
"fmt"
"os"
"text/tabwriter"
"time"

"github.com/spf13/cobra"

Expand All @@ -13,21 +15,47 @@ import (

var sourcesCmd = &cobra.Command{
Use: "sources",
Short: "List known snapshot mirrors",
Short: "List or probe known snapshot mirrors",
Long: `Print every snapshot source trond knows about, grouped by network and
db kind (lite vs full). Pick one with --domain on download, or let
trond pick a default by passing --network and --type.`,
trond pick a default by passing --network and --type.

Pass --probe to additionally HEAD-check each mirror and report which
ones still serve recent backups. Useful for CI to catch upstream
mirror rotations before users do — Task #161's structural follow-up.`,
RunE: runSources,
}

// init registers the probe-related flags on sourcesCmd: the --probe switch
// itself plus its timeout, staleness threshold and concurrency limit.
func init() {
	flags := sourcesCmd.Flags()

	flags.Bool("probe", false, "HEAD-check every source and report reachability + freshness")
	flags.Duration("probe-timeout", 8*time.Second, "per-HEAD HTTP timeout when probing")
	flags.Duration("stale-after", 7*24*time.Hour, "age beyond which a reachable backup is reported as 'stale'")
	flags.Int("probe-parallelism", 5, "max concurrent HEAD checks during --probe")
}

// runSources is the RunE handler for sourcesCmd. Without --probe it prints
// the static source table in the requested output format; with --probe it
// collects the probe flags into snapshot.ProbeOptions and hands off to
// runProbe.
//
// Flag lookup errors are deliberately discarded, so a flag that cannot be
// read falls back to its zero value. The "output" flag is not registered in
// this file; presumably it is defined elsewhere (e.g. as a persistent flag).
func runSources(cmd *cobra.Command, _ []string) error {
	flags := cmd.Flags()

	format, _ := flags.GetString("output")
	if wantProbe, _ := flags.GetBool("probe"); !wantProbe {
		return printSourceTable(format)
	}

	var opts snapshot.ProbeOptions
	opts.HTTPTimeout, _ = flags.GetDuration("probe-timeout")
	opts.StaleAfter, _ = flags.GetDuration("stale-after")
	workers, _ := flags.GetInt("probe-parallelism")

	return runProbe(cmd.Context(), format, opts, workers)
}

func printSourceTable(outputFmt string) error {
if outputFmt == "json" {
return output.WriteJSON(os.Stdout, map[string]any{
"sources": snapshot.SourceTable,
})
}

tw := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0)
fmt.Fprintln(tw, "NETWORK\tKIND\tENGINE\tREGION\tDOMAIN\t~SIZE\tNOTES")
for _, s := range snapshot.SourceTable {
Expand All @@ -36,3 +64,60 @@ func runSources(cmd *cobra.Command, _ []string) error {
}
return tw.Flush()
}

// runProbe probes every entry of snapshot.SourceTable via snapshot.ProbeAll
// and prints the results, either as a JSON document (outputFmt == "json",
// with "probed_at", "results" and "summary" keys) or as a text table.
//
// ctx is passed through to ProbeAll; a nil ctx is replaced with
// context.Background(). opts and parallelism are forwarded unchanged.
//
// It returns a non-nil error when any source is not ProbeOK, so a CI step
// fails cleanly without us hand-rolling os.Exit. The report is always
// written before that check, so the full results are still available on
// failure. An error writing the JSON report is returned as well: a consumer
// reading the report from stdout must not see a success exit for a report
// that was never (fully) written.
func runProbe(ctx context.Context, outputFmt string, opts snapshot.ProbeOptions, parallelism int) error {
	if ctx == nil {
		ctx = context.Background()
	}
	results := snapshot.ProbeAll(ctx, snapshot.SourceTable, opts, parallelism)

	// Tally results per status; used for both the report and the verdict.
	summary := make(map[snapshot.ProbeStatus]int)
	for _, r := range results {
		summary[r.Status]++
	}

	if outputFmt == "json" {
		err := output.WriteJSON(os.Stdout, map[string]any{
			"probed_at": time.Now().UTC().Format(time.RFC3339),
			"results":   results,
			"summary":   summary,
		})
		if err != nil {
			return fmt.Errorf("writing probe report: %w", err)
		}
	} else {
		printProbeTable(results, summary)
	}

	if notOK := len(results) - summary[snapshot.ProbeOK]; notOK != 0 {
		return fmt.Errorf("%d/%d sources not OK (stale=%d unreachable=%d no_backups=%d bad_config=%d)",
			notOK, len(results),
			summary[snapshot.ProbeStale], summary[snapshot.ProbeUnreachable],
			summary[snapshot.ProbeNoBackups], summary[snapshot.ProbeBadConfig])
	}
	return nil
}

// printProbeTable writes one tab-aligned row per probe result to stdout,
// followed by a one-line summary of the per-status counts taken from
// summary. An empty LatestBackup and a non-positive LatestAgeDays are both
// rendered as "-". The tabwriter flush error is intentionally ignored; the
// function has no error return.
func printProbeTable(results []snapshot.ProbeResult, summary map[snapshot.ProbeStatus]int) {
	const (
		header     = "STATUS\tNETWORK\tKIND\tENGINE\tDOMAIN\tLATEST\tAGE\tLATENCY\tDETAIL"
		rowFormat  = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%dms\t%s\n"
		tailFormat = "\nsummary: ok=%d stale=%d unreachable=%d no_backups=%d bad_config=%d\n"
	)

	table := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0)
	fmt.Fprintln(table, header)

	for _, res := range results {
		ageCol := "-"
		if res.LatestAgeDays > 0 {
			ageCol = fmt.Sprintf("%dd", res.LatestAgeDays)
		}

		latestCol := res.LatestBackup
		if latestCol == "" {
			latestCol = "-"
		}

		fmt.Fprintf(table, rowFormat,
			res.Status,
			res.Source.Network,
			res.Source.DBKind,
			res.Source.DBEngine,
			res.Source.Domain,
			latestCol,
			ageCol,
			res.LatencyMs,
			res.Err)
	}
	_ = table.Flush()

	fmt.Printf(tailFormat,
		summary[snapshot.ProbeOK],
		summary[snapshot.ProbeStale],
		summary[snapshot.ProbeUnreachable],
		summary[snapshot.ProbeNoBackups],
		summary[snapshot.ProbeBadConfig])
}
Loading
Loading