tronprotocol
diff --git a/‎cmd/heal.go‎
Lines changed: 244 additions & 0 deletions b/‎cmd/heal.go‎
Lines changed: 244 additions & 0 deletions
diff --git a/‎cmd/heal_test.go‎
Lines changed: 84 additions & 0 deletions b/‎cmd/heal_test.go‎
Lines changed: 84 additions & 0 deletions
diff --git a/‎cmd/schema_coverage_test.go‎
Lines changed: 1 addition & 0 deletions b/‎cmd/schema_coverage_test.go‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎internal/mcp/conf_helpers.go‎
Lines changed: 29 additions & 0 deletions b/‎internal/mcp/conf_helpers.go‎
Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,244 @@
+package cmd
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"strings"
+	"time"
+
+	"github.com/spf13/cobra"
+
+	"github.com/tronprotocol/tron-deployment/internal/diagnosis"
+	"github.com/tronprotocol/tron-deployment/internal/output"
+)
+
+// `trond heal <node>` is the auto-fix counterpart to `trond diagnose`.
+// It runs the same diagnostic suite and, for each failed check whose
+// remediation is *known and safe*, applies the fix automatically.
+// Anything destructive (remove, network destroy, --force) stays in
+// human hands.
+//
+// Conservative by design: a v1 healer that auto-restarts on every
+// fail signal would run away — sync_progress=fail just means "wait
+// longer," not "restart." We map specific (check, current state)
+// tuples to specific actions and explicitly skip the rest, surfacing
+// suggestions[] so the operator can pick up where heal stopped.
+//
+// Output schema: schemas/output/heal.schema.json.
+//
+// Idempotent: safe to re-run. If the previous run fixed everything,
+// the next one returns healed=[] / skipped=[] / still_failing=[].
+
+var (
+	healDryRun bool
+	healOnly   []string // restrict to specific check names; defaults = all
+)
+
+var autoHealCmd = &cobra.Command{
+	Use:   "auto-heal <node>",
+	Short: "Run diagnose, then auto-fix the failures whose remediation is known + safe",
+	Long: `Heal walks trond's diagnose output and attempts the documented
+remediation for each fail. Read-only inspection (status, diagnose);
+destructive actions stay behind a HUMAN_REQUIRED gate.
+
+Currently auto-fixable:
+  port_listening=fail and node.status=stopped → trond start <node>
+
+Surfaced for human action (heal does NOT touch them):
+  sync_progress=fail   — node is alive, just behind. Wait.
+  peer_count=fail      — recovers on its own when peers come back.
+  disk_space=fail      — needs operator attention.
+  memory_usage=fail    — needs operator attention.
+
+Use --dry-run to see what heal would do without acting.`,
+	Args: cobra.ExactArgs(1),
+	RunE: runAutoHeal,
+}
+
+func init() {
+	autoHealCmd.Flags().BoolVar(&healDryRun, "dry-run", false,
+		"Print proposed actions without executing them")
+	autoHealCmd.Flags().StringSliceVar(&healOnly, "only", nil,
+		"Comma-separated check names to consider; default = all checks")
+	rootCmd.AddCommand(autoHealCmd)
+}
+
+// healAction is one auto-fix attempt: which check triggered it,
+// what action we ran, and whether it succeeded.
+type healAction struct {
+	Check   string `json:"check"`
+	Action  string `json:"action"`
+	Result  string `json:"result"` // succeeded | failed | dry_run
+	Message string `json:"message,omitempty"`
+}
+
+// healSkip records a fail check we intentionally didn't auto-fix
+// (with the reason). Surfaces the suggestions[] so the operator
+// has the same context the agent would.
+type healSkip struct {
+	Check       string   `json:"check"`
+	Reason      string   `json:"reason"`
+	Suggestions []string `json:"suggestions,omitempty"`
+}
+
+func runAutoHeal(cmd *cobra.Command, args []string) error {
+	name := args[0]
+	start := time.Now()
+
+	nc, err := resolveNodeContext(name)
+	if err != nil {
+		return err
+	}
+	defer nc.Close()
+
+	// Run the same checker matrix `trond diagnose` does.
+	opts := diagnosis.CheckOpts{
+		NodeName:    nc.Node.Name,
+		NodeType:    "", // diagnose doesn't read this from state today; checkers cope
+		Runtime:     nc.Node.Runtime,
+		HTTPPort:    nc.Node.HTTPPort,
+		GRPCPort:    nc.Node.GRPCPort,
+		InstallPath: nc.Node.InstallPath,
+	}
+	ctx, cancel := context.WithTimeout(cmd.Context(), 30*time.Second)
+	defer cancel()
+	checkers := diagnosis.AllCheckers()
+	results := make([]diagnosis.CheckResult, 0, len(checkers))
+	for _, c := range checkers {
+		if len(healOnly) > 0 && !contains(healOnly, c.Name()) {
+			continue
+		}
+		results = append(results, c.Run(ctx, nc.Target, opts))
+	}
+
+	var (
+		healed       []healAction
+		skipped      []healSkip
+		stillFailing []diagnosis.CheckResult
+	)
+
+	for _, r := range results {
+		if r.Status != diagnosis.StatusFail {
+			continue
+		}
+		action, ok := proposeHealAction(r, nc.Node.Status)
+		if !ok {
+			skipped = append(skipped, healSkip{
+				Check:       r.Name,
+				Reason:      "no auto-fix mapped (manual remediation required)",
+				Suggestions: r.Suggestions,
+			})
+			stillFailing = append(stillFailing, r)
+			continue
+		}
+		if healDryRun {
+			healed = append(healed, healAction{
+				Check:   r.Name,
+				Action:  action.Action,
+				Result:  "dry_run",
+				Message: action.Message,
+			})
+			continue
+		}
+		if err := executeHealAction(cmd.Context(), nc, action); err != nil {
+			healed = append(healed, healAction{
+				Check:   r.Name,
+				Action:  action.Action,
+				Result:  "failed",
+				Message: err.Error(),
+			})
+			stillFailing = append(stillFailing, r)
+			continue
+		}
+		healed = append(healed, healAction{
+			Check:   r.Name,
+			Action:  action.Action,
+			Result:  "succeeded",
+			Message: action.Message,
+		})
+	}
+
+	result := map[string]any{
+		"name":          name,
+		"healed":        healed,
+		"skipped":       skipped,
+		"still_failing": stillFailing,
+		"duration_ms":   time.Since(start).Milliseconds(),
+		"dry_run":       healDryRun,
+	}
+
+	outputFmt, _ := cmd.Flags().GetString("output")
+	if outputFmt == "json" {
+		return output.WriteJSON(os.Stdout, result)
+	}
+	if len(healed) == 0 && len(skipped) == 0 {
+		fmt.Printf("✓ %s: no failed checks; nothing to heal.\n", name)
+		return nil
+	}
+	for _, h := range healed {
+		fmt.Printf("[%s] %s → %s: %s\n", h.Result, h.Check, h.Action, h.Message)
+	}
+	for _, s := range skipped {
+		fmt.Printf("[skipped] %s: %s\n", s.Check, s.Reason)
+		for _, sg := range s.Suggestions {
+			fmt.Printf("    - %s\n", sg)
+		}
+	}
+	return nil
+}
+
+// proposeHealAction maps (check.Name, current state) tuples to a
+// concrete fix. Returns ok=false when no automatic remediation is
+// safe — the caller surfaces suggestions[] instead.
+//
+// Adding a new auto-fix means landing both:
+//
+//  1. A case here that returns ok=true plus a healAction definition.
+//  2. A test in cmd/heal_test.go pinning the (check, state) tuple
+//     so the mapping doesn't silently drift.
+func proposeHealAction(r diagnosis.CheckResult, nodeStatus string) (proposedAction, bool) {
+	// Switch (rather than if-else) so future cases land cleanly:
+	// each (check, state) tuple gets one case + one test row.
+	//nolint:gocritic // single-case today; will grow per the package doc above.
+	switch {
+	case r.Name == "port_listening" && nodeStatus == "stopped":
+		return proposedAction{
+			Action:  "start",
+			Message: "node was marked stopped in state; bringing it back up",
+		}, true
+	}
+	return proposedAction{}, false
+}
+
+type proposedAction struct {
+	Action  string // "start" | "restart" | future actions
+	Message string
+}
+
+// executeHealAction runs the proposed action against the node. We
+// reuse the existing trond commands' machinery rather than calling
+// docker / systemd directly so audit logs + state updates flow
+// through the same path a manual `trond start` would.
+func executeHealAction(ctx context.Context, nc *nodeContext, action proposedAction) error {
+	if action.Action == "start" {
+		// Mirror cmd/start.go without going through cobra. The
+		// runtime's Start handles docker compose start / systemctl
+		// start uniformly.
+		if err := nc.Runtime.Start(ctx, nc.Node.Name); err != nil {
+			return err
+		}
+		nc.Node.Status = "running"
+		return nc.SaveState()
+	}
+	return fmt.Errorf("unknown heal action %q", action.Action)
+}
+
+func contains(haystack []string, needle string) bool {
+	for _, s := range haystack {
+		if strings.EqualFold(s, needle) {
+			return true
+		}
+	}
+	return false
+}
@@ -0,0 +1,84 @@
+package cmd
+
+import (
+	"testing"
+
+	"github.com/tronprotocol/tron-deployment/internal/diagnosis"
+)
+
+// TestProposeHealAction pins the (check, current state) → action
+// mapping for `trond auto-heal`. Every case here is a contract:
+// adding a new mapping requires adding a case here so the table
+// stays the source of truth.
+func TestProposeHealAction(t *testing.T) {
+	cases := []struct {
+		name       string
+		check      diagnosis.CheckResult
+		nodeStatus string
+		wantOK     bool
+		wantAction string
+	}{
+		{
+			name: "port-listening-fail-stopped-node-can-be-started",
+			check: diagnosis.CheckResult{
+				Name:   "port_listening",
+				Status: diagnosis.StatusFail,
+			},
+			nodeStatus: "stopped",
+			wantOK:     true,
+			wantAction: "start",
+		},
+		{
+			name: "port-listening-fail-running-node-no-auto-fix",
+			check: diagnosis.CheckResult{
+				Name:   "port_listening",
+				Status: diagnosis.StatusFail,
+			},
+			// If state thinks the node is running but ports aren't
+			// listening, auto-restart is risky (e.g. the container
+			// may be in the middle of a long startup); surface to
+			// human instead.
+			nodeStatus: "running",
+			wantOK:     false,
+		},
+		{
+			name: "sync-progress-fail-no-auto-fix",
+			check: diagnosis.CheckResult{
+				Name:   "sync_progress",
+				Status: diagnosis.StatusFail,
+			},
+			nodeStatus: "running",
+			wantOK:     false,
+		},
+		{
+			name: "peer-count-fail-no-auto-fix",
+			check: diagnosis.CheckResult{
+				Name:   "peer_count",
+				Status: diagnosis.StatusFail,
+			},
+			nodeStatus: "running",
+			wantOK:     false,
+		},
+		{
+			name: "disk-space-fail-no-auto-fix",
+			check: diagnosis.CheckResult{
+				Name:   "disk_space",
+				Status: diagnosis.StatusFail,
+			},
+			nodeStatus: "running",
+			wantOK:     false,
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			got, ok := proposeHealAction(tc.check, tc.nodeStatus)
+			if ok != tc.wantOK {
+				t.Errorf("ok: got %v, want %v", ok, tc.wantOK)
+			}
+			if tc.wantOK && got.Action != tc.wantAction {
+				t.Errorf("action: got %q, want %q", got.Action, tc.wantAction)
+			}
+		})
+	}
+}
@@ -72,6 +72,7 @@ func TestSchemaCoverage(t *testing.T) {
 	// lookup used by `trond schema`. Keep in sync with cmd/schema.go.
 	lookup := map[string]string{
 		"trond apply":             "apply",
+		"trond auto-heal":         "auto-heal",
 		"trond config validate":   "config-validate",
 		"trond config render":     "config-render",
 		"trond config diff":       "config-diff",
 
@@ -0,0 +1,29 @@
+package mcp
+
+import (
+	"context"
+	"fmt"
+
+	"github.com/tronprotocol/tron-deployment/internal/state"
+	"github.com/tronprotocol/tron-deployment/internal/target"
+)
+
+// readLiveConfigForMCP returns the bytes of the conf file currently
+// in use by the running node, regardless of runtime. Shared by
+// resources.go (trond://nodes/<name>/conf) and the future
+// verify_config tool.
+func readLiveConfigForMCP(ctx context.Context, tgt target.Target, node *state.ManagedNode) (string, error) {
+	if node.Runtime == "jar" {
+		out, err := tgt.Exec(ctx, "cat", node.InstallPath+"/conf/"+node.Name+".conf")
+		if err != nil {
+			return "", fmt.Errorf("read jar conf: %w", err)
+		}
+		return string(out), nil
+	}
+	out, err := tgt.Exec(ctx, "docker", "exec", node.Name, "cat",
+		"/java-tron/conf/"+node.Name+".conf")
+	if err != nil {
+		return "", fmt.Errorf("docker exec cat: %w", err)
+	}
+	return string(out), nil
+}