Skip to content

Commit 43b3fcb

Browse files
authored
Merge pull request #166 from barbatos2011/feat/heal-and-resource-templates
auto-heal + MCP resource templates
2 parents 0e22f35 + ef2eb8d commit 43b3fcb

15 files changed

Lines changed: 833 additions & 83 deletions

cmd/heal.go

Lines changed: 244 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,244 @@
1+
package cmd
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"os"
7+
"strings"
8+
"time"
9+
10+
"github.com/spf13/cobra"
11+
12+
"github.com/tronprotocol/tron-deployment/internal/diagnosis"
13+
"github.com/tronprotocol/tron-deployment/internal/output"
14+
)
15+
16+
// `trond auto-heal <node>` is the auto-fix counterpart to `trond diagnose`.
17+
// It runs the same diagnostic suite and, for each failed check whose
18+
// remediation is *known and safe*, applies the fix automatically.
19+
// Anything destructive (remove, network destroy, --force) stays in
20+
// human hands.
21+
//
22+
// Conservative by design: a v1 healer that auto-restarts on every
23+
// fail signal would run away — sync_progress=fail just means "wait
24+
// longer," not "restart." We map specific (check, current state)
25+
// tuples to specific actions and explicitly skip the rest, surfacing
26+
// suggestions[] so the operator can pick up where heal stopped.
27+
//
28+
// Output schema: schemas/output/heal.schema.json.
29+
//
30+
// Idempotent: safe to re-run. If the previous run fixed everything,
31+
// the next one returns healed=[] / skipped=[] / still_failing=[].
32+
33+
// Flag-backed settings for `trond auto-heal`; both are bound in init.
var (
	// healDryRun is set by --dry-run: proposed actions are reported
	// but not executed.
	healDryRun bool
	// healOnly is set by --only: when non-empty, only the named checks
	// are run. Empty means every check.
	healOnly []string
)
37+
38+
var autoHealCmd = &cobra.Command{
39+
Use: "auto-heal <node>",
40+
Short: "Run diagnose, then auto-fix the failures whose remediation is known + safe",
41+
Long: `Heal walks trond's diagnose output and attempts the documented
42+
remediation for each fail. Read-only inspection (status, diagnose);
43+
destructive actions stay behind a HUMAN_REQUIRED gate.
44+
45+
Currently auto-fixable:
46+
port_listening=fail and node.status=stopped → trond start <node>
47+
48+
Surfaced for human action (heal does NOT touch them):
49+
sync_progress=fail — node is alive, just behind. Wait.
50+
peer_count=fail — recovers on its own when peers come back.
51+
disk_space=fail — needs operator attention.
52+
memory_usage=fail — needs operator attention.
53+
54+
Use --dry-run to see what heal would do without acting.`,
55+
Args: cobra.ExactArgs(1),
56+
RunE: runAutoHeal,
57+
}
58+
59+
func init() {
60+
autoHealCmd.Flags().BoolVar(&healDryRun, "dry-run", false,
61+
"Print proposed actions without executing them")
62+
autoHealCmd.Flags().StringSliceVar(&healOnly, "only", nil,
63+
"Comma-separated check names to consider; default = all checks")
64+
rootCmd.AddCommand(autoHealCmd)
65+
}
66+
67+
// healAction records a single remediation attempt made by auto-heal.
type healAction struct {
	// Check is the name of the failed diagnostic that triggered the attempt.
	Check string `json:"check"`
	// Action is the remediation that was proposed (for example "start").
	Action string `json:"action"`
	// Result is one of "succeeded", "failed" or "dry_run".
	Result string `json:"result"`
	// Message is a human-readable note or error text; it is left out of
	// the JSON encoding when empty.
	Message string `json:"message,omitempty"`
}
75+
76+
// healSkip describes a failed check that auto-heal deliberately left
// alone, together with the reason and the check's own suggestions so
// the operator can take over from there.
type healSkip struct {
	// Check is the name of the failed diagnostic.
	Check string `json:"check"`
	// Reason explains why no automatic fix was attempted.
	Reason string `json:"reason"`
	// Suggestions are copied from the check result; the field is left
	// out of the JSON encoding when empty.
	Suggestions []string `json:"suggestions,omitempty"`
}
84+
85+
func runAutoHeal(cmd *cobra.Command, args []string) error {
86+
name := args[0]
87+
start := time.Now()
88+
89+
nc, err := resolveNodeContext(name)
90+
if err != nil {
91+
return err
92+
}
93+
defer nc.Close()
94+
95+
// Run the same checker matrix `trond diagnose` does.
96+
opts := diagnosis.CheckOpts{
97+
NodeName: nc.Node.Name,
98+
NodeType: "", // diagnose doesn't read this from state today; checkers cope
99+
Runtime: nc.Node.Runtime,
100+
HTTPPort: nc.Node.HTTPPort,
101+
GRPCPort: nc.Node.GRPCPort,
102+
InstallPath: nc.Node.InstallPath,
103+
}
104+
ctx, cancel := context.WithTimeout(cmd.Context(), 30*time.Second)
105+
defer cancel()
106+
checkers := diagnosis.AllCheckers()
107+
results := make([]diagnosis.CheckResult, 0, len(checkers))
108+
for _, c := range checkers {
109+
if len(healOnly) > 0 && !contains(healOnly, c.Name()) {
110+
continue
111+
}
112+
results = append(results, c.Run(ctx, nc.Target, opts))
113+
}
114+
115+
var (
116+
healed []healAction
117+
skipped []healSkip
118+
stillFailing []diagnosis.CheckResult
119+
)
120+
121+
for _, r := range results {
122+
if r.Status != diagnosis.StatusFail {
123+
continue
124+
}
125+
action, ok := proposeHealAction(r, nc.Node.Status)
126+
if !ok {
127+
skipped = append(skipped, healSkip{
128+
Check: r.Name,
129+
Reason: "no auto-fix mapped (manual remediation required)",
130+
Suggestions: r.Suggestions,
131+
})
132+
stillFailing = append(stillFailing, r)
133+
continue
134+
}
135+
if healDryRun {
136+
healed = append(healed, healAction{
137+
Check: r.Name,
138+
Action: action.Action,
139+
Result: "dry_run",
140+
Message: action.Message,
141+
})
142+
continue
143+
}
144+
if err := executeHealAction(cmd.Context(), nc, action); err != nil {
145+
healed = append(healed, healAction{
146+
Check: r.Name,
147+
Action: action.Action,
148+
Result: "failed",
149+
Message: err.Error(),
150+
})
151+
stillFailing = append(stillFailing, r)
152+
continue
153+
}
154+
healed = append(healed, healAction{
155+
Check: r.Name,
156+
Action: action.Action,
157+
Result: "succeeded",
158+
Message: action.Message,
159+
})
160+
}
161+
162+
result := map[string]any{
163+
"name": name,
164+
"healed": healed,
165+
"skipped": skipped,
166+
"still_failing": stillFailing,
167+
"duration_ms": time.Since(start).Milliseconds(),
168+
"dry_run": healDryRun,
169+
}
170+
171+
outputFmt, _ := cmd.Flags().GetString("output")
172+
if outputFmt == "json" {
173+
return output.WriteJSON(os.Stdout, result)
174+
}
175+
if len(healed) == 0 && len(skipped) == 0 {
176+
fmt.Printf("✓ %s: no failed checks; nothing to heal.\n", name)
177+
return nil
178+
}
179+
for _, h := range healed {
180+
fmt.Printf("[%s] %s → %s: %s\n", h.Result, h.Check, h.Action, h.Message)
181+
}
182+
for _, s := range skipped {
183+
fmt.Printf("[skipped] %s: %s\n", s.Check, s.Reason)
184+
for _, sg := range s.Suggestions {
185+
fmt.Printf(" - %s\n", sg)
186+
}
187+
}
188+
return nil
189+
}
190+
191+
// proposeHealAction maps (check.Name, current state) tuples to a
192+
// concrete fix. Returns ok=false when no automatic remediation is
193+
// safe — the caller surfaces suggestions[] instead.
194+
//
195+
// Adding a new auto-fix means landing both:
196+
//
197+
// 1. A case here that returns ok=true plus a healAction definition.
198+
// 2. A test in cmd/heal_test.go pinning the (check, state) tuple
199+
// so the mapping doesn't silently drift.
200+
func proposeHealAction(r diagnosis.CheckResult, nodeStatus string) (proposedAction, bool) {
201+
// Switch (rather than if-else) so future cases land cleanly:
202+
// each (check, state) tuple gets one case + one test row.
203+
//nolint:gocritic // single-case today; will grow per the package doc above.
204+
switch {
205+
case r.Name == "port_listening" && nodeStatus == "stopped":
206+
return proposedAction{
207+
Action: "start",
208+
Message: "node was marked stopped in state; bringing it back up",
209+
}, true
210+
}
211+
return proposedAction{}, false
212+
}
213+
214+
// proposedAction is the remediation chosen for a failed check, before
// it has been executed.
type proposedAction struct {
	// Action names the remediation: "start" | "restart" | future actions.
	Action string
	// Message is the human-readable explanation reported with the action.
	Message string
}
218+
219+
// executeHealAction runs the proposed action against the node. We
220+
// reuse the existing trond commands' machinery rather than calling
221+
// docker / systemd directly so audit logs + state updates flow
222+
// through the same path a manual `trond start` would.
223+
func executeHealAction(ctx context.Context, nc *nodeContext, action proposedAction) error {
224+
if action.Action == "start" {
225+
// Mirror cmd/start.go without going through cobra. The
226+
// runtime's Start handles docker compose start / systemctl
227+
// start uniformly.
228+
if err := nc.Runtime.Start(ctx, nc.Node.Name); err != nil {
229+
return err
230+
}
231+
nc.Node.Status = "running"
232+
return nc.SaveState()
233+
}
234+
return fmt.Errorf("unknown heal action %q", action.Action)
235+
}
236+
237+
// contains reports whether needle matches any element of haystack,
// compared case-insensitively via strings.EqualFold. Elements are
// compared as-is, with no whitespace trimming.
func contains(haystack []string, needle string) bool {
	for i := 0; i < len(haystack); i++ {
		if !strings.EqualFold(haystack[i], needle) {
			continue
		}
		return true
	}
	return false
}

cmd/heal_test.go

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
package cmd
2+
3+
import (
4+
"testing"
5+
6+
"github.com/tronprotocol/tron-deployment/internal/diagnosis"
7+
)
8+
9+
// TestProposeHealAction pins the (check, current state) → action
10+
// mapping for `trond auto-heal`. Every case here is a contract:
11+
// adding a new mapping requires adding a case here so the table
12+
// stays the source of truth.
13+
func TestProposeHealAction(t *testing.T) {
14+
cases := []struct {
15+
name string
16+
check diagnosis.CheckResult
17+
nodeStatus string
18+
wantOK bool
19+
wantAction string
20+
}{
21+
{
22+
name: "port-listening-fail-stopped-node-can-be-started",
23+
check: diagnosis.CheckResult{
24+
Name: "port_listening",
25+
Status: diagnosis.StatusFail,
26+
},
27+
nodeStatus: "stopped",
28+
wantOK: true,
29+
wantAction: "start",
30+
},
31+
{
32+
name: "port-listening-fail-running-node-no-auto-fix",
33+
check: diagnosis.CheckResult{
34+
Name: "port_listening",
35+
Status: diagnosis.StatusFail,
36+
},
37+
// If state thinks the node is running but ports aren't
38+
// listening, auto-restart is risky (e.g. the container
39+
// may be in the middle of a long startup); surface to
40+
// human instead.
41+
nodeStatus: "running",
42+
wantOK: false,
43+
},
44+
{
45+
name: "sync-progress-fail-no-auto-fix",
46+
check: diagnosis.CheckResult{
47+
Name: "sync_progress",
48+
Status: diagnosis.StatusFail,
49+
},
50+
nodeStatus: "running",
51+
wantOK: false,
52+
},
53+
{
54+
name: "peer-count-fail-no-auto-fix",
55+
check: diagnosis.CheckResult{
56+
Name: "peer_count",
57+
Status: diagnosis.StatusFail,
58+
},
59+
nodeStatus: "running",
60+
wantOK: false,
61+
},
62+
{
63+
name: "disk-space-fail-no-auto-fix",
64+
check: diagnosis.CheckResult{
65+
Name: "disk_space",
66+
Status: diagnosis.StatusFail,
67+
},
68+
nodeStatus: "running",
69+
wantOK: false,
70+
},
71+
}
72+
73+
for _, tc := range cases {
74+
t.Run(tc.name, func(t *testing.T) {
75+
got, ok := proposeHealAction(tc.check, tc.nodeStatus)
76+
if ok != tc.wantOK {
77+
t.Errorf("ok: got %v, want %v", ok, tc.wantOK)
78+
}
79+
if tc.wantOK && got.Action != tc.wantAction {
80+
t.Errorf("action: got %q, want %q", got.Action, tc.wantAction)
81+
}
82+
})
83+
}
84+
}

cmd/schema_coverage_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ func TestSchemaCoverage(t *testing.T) {
7272
// lookup used by `trond schema`. Keep in sync with cmd/schema.go.
7373
lookup := map[string]string{
7474
"trond apply": "apply",
75+
"trond auto-heal": "auto-heal",
7576
"trond config validate": "config-validate",
7677
"trond config render": "config-render",
7778
"trond config diff": "config-diff",

internal/mcp/conf_helpers.go

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
package mcp
2+
3+
import (
4+
"context"
5+
"fmt"
6+
7+
"github.com/tronprotocol/tron-deployment/internal/state"
8+
"github.com/tronprotocol/tron-deployment/internal/target"
9+
)
10+
11+
// readLiveConfigForMCP returns the bytes of the conf file currently
12+
// in use by the running node, regardless of runtime. Shared by
13+
// resources.go (trond://nodes/<name>/conf) and the future
14+
// verify_config tool.
15+
func readLiveConfigForMCP(ctx context.Context, tgt target.Target, node *state.ManagedNode) (string, error) {
16+
if node.Runtime == "jar" {
17+
out, err := tgt.Exec(ctx, "cat", node.InstallPath+"/conf/"+node.Name+".conf")
18+
if err != nil {
19+
return "", fmt.Errorf("read jar conf: %w", err)
20+
}
21+
return string(out), nil
22+
}
23+
out, err := tgt.Exec(ctx, "docker", "exec", node.Name, "cat",
24+
"/java-tron/conf/"+node.Name+".conf")
25+
if err != nil {
26+
return "", fmt.Errorf("docker exec cat: %w", err)
27+
}
28+
return string(out), nil
29+
}

0 commit comments

Comments
 (0)