|
| 1 | +package cmd |
| 2 | + |
| 3 | +import ( |
| 4 | + "context" |
| 5 | + "fmt" |
| 6 | + "os" |
| 7 | + "strings" |
| 8 | + "time" |
| 9 | + |
| 10 | + "github.com/spf13/cobra" |
| 11 | + |
| 12 | + "github.com/tronprotocol/tron-deployment/internal/diagnosis" |
| 13 | + "github.com/tronprotocol/tron-deployment/internal/output" |
| 14 | +) |
| 15 | + |
// `trond auto-heal <node>` is the auto-fix counterpart to `trond diagnose`.
// It runs the same diagnostic suite and, for each failed check whose
// remediation is *known and safe*, applies the fix automatically.
// Anything destructive (remove, network destroy, --force) stays in
// human hands.
//
// Conservative by design: a v1 healer that auto-restarts on every
// fail signal would run away — sync_progress=fail just means "wait
// longer," not "restart." We map specific (check, current state)
// tuples to specific actions and explicitly skip the rest, surfacing
// suggestions[] so the operator can pick up where auto-heal stopped.
//
// Output schema: schemas/output/heal.schema.json.
//
// Idempotent: safe to re-run. If the previous run fixed everything,
// the next one returns healed=[] / skipped=[] / still_failing=[].
| 32 | + |
// Flag-backed settings for the auto-heal command; init binds them to
// --dry-run and --only.
var (
	// healDryRun reports proposed actions without executing them.
	healDryRun bool
	// healOnly restricts the run to the named checks; empty means all checks.
	healOnly []string
)
| 37 | + |
// autoHealCmd defines the `auto-heal <node>` subcommand. It takes exactly one
// positional argument and hands execution to runAutoHeal. The Long text is
// user-facing help and lists which failed checks are auto-fixed and which are
// left to the operator.
var autoHealCmd = &cobra.Command{
	Use:   "auto-heal <node>",
	Short: "Run diagnose, then auto-fix the failures whose remediation is known + safe",
	Long: `Heal walks trond's diagnose output and attempts the documented
remediation for each fail. Read-only inspection (status, diagnose);
destructive actions stay behind a HUMAN_REQUIRED gate.

Currently auto-fixable:
 port_listening=fail and node.status=stopped → trond start <node>

Surfaced for human action (heal does NOT touch them):
 sync_progress=fail — node is alive, just behind. Wait.
 peer_count=fail — recovers on its own when peers come back.
 disk_space=fail — needs operator attention.
 memory_usage=fail — needs operator attention.

Use --dry-run to see what heal would do without acting.`,
	// Exactly one argument is accepted; runAutoHeal reads it as args[0].
	Args: cobra.ExactArgs(1),
	RunE: runAutoHeal,
}
| 58 | + |
| 59 | +func init() { |
| 60 | + autoHealCmd.Flags().BoolVar(&healDryRun, "dry-run", false, |
| 61 | + "Print proposed actions without executing them") |
| 62 | + autoHealCmd.Flags().StringSliceVar(&healOnly, "only", nil, |
| 63 | + "Comma-separated check names to consider; default = all checks") |
| 64 | + rootCmd.AddCommand(autoHealCmd) |
| 65 | +} |
| 66 | + |
// healAction is the report entry for one auto-fix attempt.
type healAction struct {
	// Check is the name of the failed check that triggered the attempt.
	Check string `json:"check"`
	// Action is the remediation that was chosen (for example "start").
	Action string `json:"action"`
	// Result is one of: succeeded | failed | dry_run.
	Result string `json:"result"`
	// Message carries the action's description, or the error text when the
	// attempt failed; it is omitted from JSON when empty.
	Message string `json:"message,omitempty"`
}
| 75 | + |
// healSkip is the report entry for a failed check that was deliberately
// left alone. It carries the check's suggestions[] forward so the operator
// sees the same context the agent would.
type healSkip struct {
	// Check is the name of the failed check.
	Check string `json:"check"`
	// Reason explains why no automatic fix was attempted.
	Reason string `json:"reason"`
	// Suggestions are copied from the check result; omitted from JSON when empty.
	Suggestions []string `json:"suggestions,omitempty"`
}
| 84 | + |
| 85 | +func runAutoHeal(cmd *cobra.Command, args []string) error { |
| 86 | + name := args[0] |
| 87 | + start := time.Now() |
| 88 | + |
| 89 | + nc, err := resolveNodeContext(name) |
| 90 | + if err != nil { |
| 91 | + return err |
| 92 | + } |
| 93 | + defer nc.Close() |
| 94 | + |
| 95 | + // Run the same checker matrix `trond diagnose` does. |
| 96 | + opts := diagnosis.CheckOpts{ |
| 97 | + NodeName: nc.Node.Name, |
| 98 | + NodeType: "", // diagnose doesn't read this from state today; checkers cope |
| 99 | + Runtime: nc.Node.Runtime, |
| 100 | + HTTPPort: nc.Node.HTTPPort, |
| 101 | + GRPCPort: nc.Node.GRPCPort, |
| 102 | + InstallPath: nc.Node.InstallPath, |
| 103 | + } |
| 104 | + ctx, cancel := context.WithTimeout(cmd.Context(), 30*time.Second) |
| 105 | + defer cancel() |
| 106 | + checkers := diagnosis.AllCheckers() |
| 107 | + results := make([]diagnosis.CheckResult, 0, len(checkers)) |
| 108 | + for _, c := range checkers { |
| 109 | + if len(healOnly) > 0 && !contains(healOnly, c.Name()) { |
| 110 | + continue |
| 111 | + } |
| 112 | + results = append(results, c.Run(ctx, nc.Target, opts)) |
| 113 | + } |
| 114 | + |
| 115 | + var ( |
| 116 | + healed []healAction |
| 117 | + skipped []healSkip |
| 118 | + stillFailing []diagnosis.CheckResult |
| 119 | + ) |
| 120 | + |
| 121 | + for _, r := range results { |
| 122 | + if r.Status != diagnosis.StatusFail { |
| 123 | + continue |
| 124 | + } |
| 125 | + action, ok := proposeHealAction(r, nc.Node.Status) |
| 126 | + if !ok { |
| 127 | + skipped = append(skipped, healSkip{ |
| 128 | + Check: r.Name, |
| 129 | + Reason: "no auto-fix mapped (manual remediation required)", |
| 130 | + Suggestions: r.Suggestions, |
| 131 | + }) |
| 132 | + stillFailing = append(stillFailing, r) |
| 133 | + continue |
| 134 | + } |
| 135 | + if healDryRun { |
| 136 | + healed = append(healed, healAction{ |
| 137 | + Check: r.Name, |
| 138 | + Action: action.Action, |
| 139 | + Result: "dry_run", |
| 140 | + Message: action.Message, |
| 141 | + }) |
| 142 | + continue |
| 143 | + } |
| 144 | + if err := executeHealAction(cmd.Context(), nc, action); err != nil { |
| 145 | + healed = append(healed, healAction{ |
| 146 | + Check: r.Name, |
| 147 | + Action: action.Action, |
| 148 | + Result: "failed", |
| 149 | + Message: err.Error(), |
| 150 | + }) |
| 151 | + stillFailing = append(stillFailing, r) |
| 152 | + continue |
| 153 | + } |
| 154 | + healed = append(healed, healAction{ |
| 155 | + Check: r.Name, |
| 156 | + Action: action.Action, |
| 157 | + Result: "succeeded", |
| 158 | + Message: action.Message, |
| 159 | + }) |
| 160 | + } |
| 161 | + |
| 162 | + result := map[string]any{ |
| 163 | + "name": name, |
| 164 | + "healed": healed, |
| 165 | + "skipped": skipped, |
| 166 | + "still_failing": stillFailing, |
| 167 | + "duration_ms": time.Since(start).Milliseconds(), |
| 168 | + "dry_run": healDryRun, |
| 169 | + } |
| 170 | + |
| 171 | + outputFmt, _ := cmd.Flags().GetString("output") |
| 172 | + if outputFmt == "json" { |
| 173 | + return output.WriteJSON(os.Stdout, result) |
| 174 | + } |
| 175 | + if len(healed) == 0 && len(skipped) == 0 { |
| 176 | + fmt.Printf("✓ %s: no failed checks; nothing to heal.\n", name) |
| 177 | + return nil |
| 178 | + } |
| 179 | + for _, h := range healed { |
| 180 | + fmt.Printf("[%s] %s → %s: %s\n", h.Result, h.Check, h.Action, h.Message) |
| 181 | + } |
| 182 | + for _, s := range skipped { |
| 183 | + fmt.Printf("[skipped] %s: %s\n", s.Check, s.Reason) |
| 184 | + for _, sg := range s.Suggestions { |
| 185 | + fmt.Printf(" - %s\n", sg) |
| 186 | + } |
| 187 | + } |
| 188 | + return nil |
| 189 | +} |
| 190 | + |
| 191 | +// proposeHealAction maps (check.Name, current state) tuples to a |
| 192 | +// concrete fix. Returns ok=false when no automatic remediation is |
| 193 | +// safe — the caller surfaces suggestions[] instead. |
| 194 | +// |
| 195 | +// Adding a new auto-fix means landing both: |
| 196 | +// |
| 197 | +// 1. A case here that returns ok=true plus a healAction definition. |
| 198 | +// 2. A test in cmd/heal_test.go pinning the (check, state) tuple |
| 199 | +// so the mapping doesn't silently drift. |
| 200 | +func proposeHealAction(r diagnosis.CheckResult, nodeStatus string) (proposedAction, bool) { |
| 201 | + // Switch (rather than if-else) so future cases land cleanly: |
| 202 | + // each (check, state) tuple gets one case + one test row. |
| 203 | + //nolint:gocritic // single-case today; will grow per the package doc above. |
| 204 | + switch { |
| 205 | + case r.Name == "port_listening" && nodeStatus == "stopped": |
| 206 | + return proposedAction{ |
| 207 | + Action: "start", |
| 208 | + Message: "node was marked stopped in state; bringing it back up", |
| 209 | + }, true |
| 210 | + } |
| 211 | + return proposedAction{}, false |
| 212 | +} |
| 213 | + |
// proposedAction describes a remediation chosen for a failed check, before
// it has been executed.
type proposedAction struct {
	// Action names the remediation: "start" | "restart" | future actions.
	Action string
	// Message is the human-readable explanation shown alongside the action.
	Message string
}
| 218 | + |
| 219 | +// executeHealAction runs the proposed action against the node. We |
| 220 | +// reuse the existing trond commands' machinery rather than calling |
| 221 | +// docker / systemd directly so audit logs + state updates flow |
| 222 | +// through the same path a manual `trond start` would. |
| 223 | +func executeHealAction(ctx context.Context, nc *nodeContext, action proposedAction) error { |
| 224 | + if action.Action == "start" { |
| 225 | + // Mirror cmd/start.go without going through cobra. The |
| 226 | + // runtime's Start handles docker compose start / systemctl |
| 227 | + // start uniformly. |
| 228 | + if err := nc.Runtime.Start(ctx, nc.Node.Name); err != nil { |
| 229 | + return err |
| 230 | + } |
| 231 | + nc.Node.Status = "running" |
| 232 | + return nc.SaveState() |
| 233 | + } |
| 234 | + return fmt.Errorf("unknown heal action %q", action.Action) |
| 235 | +} |
| 236 | + |
// contains reports whether needle matches any element of haystack,
// comparing case-insensitively via strings.EqualFold.
func contains(haystack []string, needle string) bool {
	for i := range haystack {
		if !strings.EqualFold(haystack[i], needle) {
			continue
		}
		return true
	}
	return false
}
0 commit comments