LAA-Software-Engineering · leo-aa88 · Jun 2, 2026 · Jun 2, 2026 · Jun 2, 2026 · Jun 2, 2026
@@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
 
 ### Added
 
+- **Run checkpointing and resume** (issue #105): SQLite `run_checkpoints` table stores per-run execution snapshots after each completed step. `agentctl run --resume <run-id>` rehydrates interpolation context and continues from the next step without replaying earlier steps. Interrupted runs exit cleanly (status `interrupted`, exit code 0) and cascade with trace retention pruning. Checkpoints are written before step rows are marked succeeded to avoid replay on crash; runs pin `workflow_spec_hash` and `environment_name` for safe resume.
 - **Built-in policy presets** (issue #104): `strict`, `permissive`, and `shell_safe`. Select via `Project.spec.defaults.policy`, by referencing a preset name on agents/workflows, or with `Policy.spec.preset` (local rules layer on top). Presets expand during [NormalizeProjectGraph]; `strict`/`permissive` materialize approval flags, while `shell_safe` sets `ResolvedPreset` and relies on runtime token classification plus tool safety metadata for plan risk.
 - **`shell_safe` token classification** for native `command.run` / `run` / `exec` / `shell` operations: read-only first tokens (`ls`, `cat`, …) run unattended when the command contains no shell metacharacters (`;|&$`, newlines, `` ` ``, `$(…)`); risky tokens, unknown tokens, and side-effecting non-shell tools require `--approve`. **Heuristic only — not a sandbox.**
 - **`spec.safety` on Tool resources** (issue #103): optional `trusted`, `sideEffects`, and `requiresApproval` fields. [NormalizeProjectGraph] materializes fail-closed defaults on load.

@@ -3,11 +3,13 @@ package cli
 import (
 	"context"
 	"encoding/json"
+	"errors"
 	"fmt"
 	"os"
 	"path/filepath"
 	"strings"
 
+	"github.com/LAA-Software-Engineering/agentic-control-plane/internal/engine"
 	"github.com/LAA-Software-Engineering/agentic-control-plane/internal/policy"
 	"github.com/LAA-Software-Engineering/agentic-control-plane/internal/render"
 	"github.com/LAA-Software-Engineering/agentic-control-plane/internal/runtime"
@@ -21,6 +23,7 @@ func newRunCmd() *cobra.Command {
 	var inputFile string
 	var inputPairs []string
 	var approves []string
+	var resumeRunID string
 
 	cmd := &cobra.Command{
 		Use:          "run workflow/<name>",
@@ -33,24 +36,48 @@ Workflow input is built from optional --input-file (JSON object) plus repeated -
 (string values only for key=value pairs). Policy-gated tool uses can be allowed with repeated
 --approve using the full uses string (e.g. tool.helper.echo).
 
+Resume an interrupted or incomplete run with --resume <run-id> (no workflow argument).
+
 Examples:
   agentctl run workflow/demo --input topic=hello
   agentctl run workflow/demo --input-file input.json
+  agentctl run --resume run-abc123
 
 Exit codes (section 11.2):
-  0 — success
+  0 — success (including interrupted runs awaiting resume)
   1 — generic failure (e.g. cannot open SQLite, start run, trace)
   2 — validation failure (project, workflow ref, input, input-file)
   4 — execution failure (step/engine error after the run row exists)
   5 — policy denial`,
-		Args: cobra.ExactArgs(1),
+		Args: func(cmd *cobra.Command, args []string) error {
+			resume, _ := cmd.Flags().GetString("resume")
+			if strings.TrimSpace(resume) != "" {
+				if len(args) != 0 {
+					return NewExitError(ExitValidationError, fmt.Errorf("run: --resume does not take a workflow argument"))
+				}
+				return nil
+			}
+			if len(args) != 1 {
+				return NewExitError(ExitValidationError, fmt.Errorf("run: requires workflow/<name> or --resume <run-id>"))
+			}
+			return nil
+		},
 		RunE: func(cmd *cobra.Command, args []string) error {
-			return runRun(cmd, args[0], inputFile, inputPairs, approves)
+			var wfName string
+			if len(args) == 1 {
+				var err error
+				wfName, err = parseWorkflowTarget(args[0])
+				if err != nil {
+					return NewExitError(ExitValidationError, err)
+				}
+			}
+			return runRun(cmd, wfName, resumeRunID, inputFile, inputPairs, approves)
 		},
 	}
 	cmd.Flags().StringVar(&inputFile, "input-file", "", "path to JSON file with workflow input object")
 	cmd.Flags().StringArrayVar(&inputPairs, "input", nil, "workflow input as key=value (repeatable; values are strings)")
 	cmd.Flags().StringArrayVar(&approves, "approve", nil, "approve a policy-gated tool uses string (repeatable)")
+	cmd.Flags().StringVar(&resumeRunID, "resume", "", "resume an interrupted or incomplete run by id")
 	return cmd
 }
 
@@ -107,6 +134,9 @@ func classifyRunError(err error) int {
 	if err == nil {
 		return ExitSuccess
 	}
+	if errors.Is(err, engine.ErrInterrupted) {
+		return ExitSuccess
+	}
 	if _, ok := policy.AsDenied(err); ok {
 		return ExitPolicyDenied
 	}
@@ -118,35 +148,43 @@ func classifyRunError(err error) int {
 		strings.Contains(msg, "invalid input JSON"),
 		strings.Contains(msg, "workflow input"),
 		strings.Contains(msg, "marshal workflow input"),
-		strings.Contains(msg, "unknown environment"):
+		strings.Contains(msg, "unknown environment"),
+		strings.Contains(msg, "workflow spec changed"),
+		strings.Contains(msg, "does not match run"):
 		return ExitValidationError
 	case strings.Contains(msg, "open sqlite"),
 		strings.Contains(msg, "ping sqlite"),
 		strings.Contains(msg, "start run:"),
-		strings.Contains(msg, "trace run."):
+		strings.Contains(msg, "trace run."),
+		strings.Contains(msg, "not found"),
+		strings.Contains(msg, "has no checkpoint"),
+		strings.Contains(msg, "is not resumable"):
 		return ExitGenericFailure
 	default:
 		return ExitExecutionError
 	}
 }
 
-func runRun(cmd *cobra.Command, target, inputFile string, inputPairs, approves []string) error {
+func runRun(cmd *cobra.Command, wfName, resumeRunID, inputFile string, inputPairs, approves []string) error {
 	ctx := context.Background()
 	g := Globals()
 
-	wfName, err := parseWorkflowTarget(target)
-	if err != nil {
-		return NewExitError(ExitValidationError, err)
+	resumeID := strings.TrimSpace(resumeRunID)
+	if resumeID == "" && wfName == "" {
+		return NewExitError(ExitValidationError, fmt.Errorf("run: requires workflow/<name> or --resume <run-id>"))
 	}
 
 	graph, root, err := prepareProjectGraph(g.ProjectRoot, g)
 	if err != nil {
 		return NewExitError(ExitValidationError, err)
 	}
 
-	inputJSON, err := buildRunInputJSON(inputFile, inputPairs)
-	if err != nil {
-		return NewExitError(ExitValidationError, err)
+	var inputJSON []byte
+	if resumeID == "" {
+		inputJSON, err = buildRunInputJSON(inputFile, inputPairs)
+		if err != nil {
+			return NewExitError(ExitValidationError, err)
+		}
 	}
 
 	env := planEnvironment(g)
@@ -165,15 +203,27 @@ func runRun(cmd *cobra.Command, target, inputFile string, inputPairs, approves [
 	defer func() { _ = st.Close() }()
 
 	rt := local.NewRuntime(root, st)
-	runID, runErr := rt.ExecuteWorkflow(ctx, runtime.WorkflowRunOptions{
-		WorkflowName:    wfName,
+	opts := runtime.WorkflowRunOptions{
 		EnvironmentName: strings.TrimSpace(g.Env),
 		Env:             env,
 		InputJSON:       inputJSON,
 		ApprovedActions: approves,
-	})
+		Resume:          resumeID != "",
+		RunID:           resumeID,
+	}
+	if !opts.Resume {
+		opts.WorkflowName = wfName
+	}
+	runID, runErr := rt.ExecuteWorkflow(ctx, opts)
+
+	outWfName := wfName
+	if opts.Resume && runID != "" {
+		if r, gerr := st.GetRun(ctx, runID); gerr == nil && r != nil {
+			outWfName = r.WorkflowName
+		}
+	}
 
-	if werr := writeRunOutput(cmd, ctx, st, env, dsn, wfName, runID, runErr, g); werr != nil {
+	if werr := writeRunOutput(cmd, ctx, st, env, dsn, outWfName, runID, runErr, g); werr != nil {
 		return werr
 	}
 	if runErr != nil {

@@ -2,11 +2,26 @@ package cli
 
 import (
 	"bytes"
+	"context"
+	"encoding/json"
+	"errors"
 	"io"
 	"os"
 	"path/filepath"
 	"strings"
 	"testing"
+	"time"
+
+	"github.com/LAA-Software-Engineering/agentic-control-plane/internal/engine"
+	"github.com/LAA-Software-Engineering/agentic-control-plane/internal/models"
+	"github.com/LAA-Software-Engineering/agentic-control-plane/internal/plan"
+	"github.com/LAA-Software-Engineering/agentic-control-plane/internal/project"
+	"github.com/LAA-Software-Engineering/agentic-control-plane/internal/runtime/local"
+	"github.com/LAA-Software-Engineering/agentic-control-plane/internal/spec"
+	"github.com/LAA-Software-Engineering/agentic-control-plane/internal/state"
+	"github.com/LAA-Software-Engineering/agentic-control-plane/internal/state/sqlite"
+	"github.com/LAA-Software-Engineering/agentic-control-plane/internal/tools"
+	"github.com/LAA-Software-Engineering/agentic-control-plane/internal/trace"
 )
 
 func runProjRoot(t *testing.T) string {
@@ -220,3 +235,126 @@ func TestRun_inputFile_succeeds(t *testing.T) {
 		t.Fatal(out.String())
 	}
 }
+
+func TestRun_resume_missingRun_exit1(t *testing.T) { // resuming a run id that was never created must fail with ExitGenericFailure
+	db := filepath.Join(t.TempDir(), "resume-missing.db") // state path inside a per-test temp dir; this test never seeds a run into it
+	root := runProjRoot(t)
+
+	ResetGlobalsForTest()
+	var out bytes.Buffer
+	cmd := NewRootCmd()
+	cmd.SetOut(&out) // stdout and stderr share one buffer so the failure message below can print everything
+	cmd.SetErr(&out)
+	cmd.SetArgs([]string{
+		"run", "--resume", "does-not-exist", // resume mode: no positional workflow argument is given
+		"--project", root,
+		"--state", db,
+	})
+	err := cmd.Execute()
+	if err == nil {
+		t.Fatal("expected error")
+	}
+	if ExitCodeOf(err) != ExitGenericFailure { // a missing run is expected to map to the generic-failure exit code, not validation or execution
+		t.Fatalf("exit=%d err=%v out=%s", ExitCodeOf(err), err, out.String())
+	}
+}
+
+func TestRun_resume_withWorkflowArg_exit2(t *testing.T) { // --resume combined with a positional workflow argument must be rejected with ExitValidationError
+	db := filepath.Join(t.TempDir(), "resume-bad-args.db") // state path inside a per-test temp dir
+	root := runProjRoot(t)
+
+	ResetGlobalsForTest()
+	cmd := NewRootCmd()
+	cmd.SetArgs([]string{
+		"run", "workflow/demo", "--resume", "some-id", // both a workflow target and --resume: the invalid combination under test
+		"--project", root,
+		"--state", db,
+	})
+	err := cmd.Execute()
+	if err == nil {
+		t.Fatal("expected error")
+	}
+	if ExitCodeOf(err) != ExitValidationError { // only the exit code is asserted; the error text is not checked
+		t.Fatalf("exit=%d err=%v", ExitCodeOf(err), err)
+	}
+}
+
+func TestRun_resume_happyPath(t *testing.T) { // end-to-end: seed a run, interrupt it through the engine, then finish it with `run --resume`
+	ctx := context.Background()
+	db := filepath.Join(t.TempDir(), "resume-happy.db") // the same file is later handed to the CLI via --state
+	root := runProjRoot(t)
+
+	st, err := sqlite.Open(ctx, db)
+	if err != nil {
+		t.Fatal(err)
+	}
+	t.Cleanup(func() { _ = st.Close() }) // store stays open until test end; it is reused for the final GetRun assertion
+
+	graph, err := project.LoadProject(root)
+	if err != nil {
+		t.Fatal(err)
+	}
+	spec.NormalizeProjectGraph(graph)
+	graph, err = local.ApplyEnvironment(graph, "staging") // same environment name the CLI receives through -e below
+	if err != nil {
+		t.Fatal(err)
+	}
+	wf := graph.Workflows["demo"]
+	wfHash, err := plan.WorkflowSpecHash(wf) // hash of the environment-applied workflow; stored on the run row below
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	runID := "cli-resume-1"
+	started := time.Date(2026, 6, 1, 12, 0, 0, 0, time.UTC) // fixed timestamp shared by the run row and the executor clock
+	if err := st.StartRun(ctx, state.Run{
+		RunID: runID, WorkflowName: "demo", Env: "dev", Status: state.RunStatusRunning, // NOTE(review): Env is "dev" while EnvironmentName below is "staging" — confirm this is intentional
+		StartedAt: started, InputJSON: `{"topic":"cli-resume"}`, TotalCostUSD: 0,
+		WorkflowSpecHash: wfHash, EnvironmentName: "staging",
+	}); err != nil {
+		t.Fatal(err)
+	}
+
+	var input map[string]any
+	if err := json.Unmarshal([]byte(`{"topic":"cli-resume"}`), &input); err != nil { // same JSON literal as the InputJSON stored on the run row
+		t.Fatal(err)
+	}
+	idx := 0
+	ex := &engine.Executor{
+		Graph: graph, ProjectRoot: root,
+		Tools: tools.NewRegistry(graph), Models: models.NewRegistry(graph),
+		Store: st, Trace: trace.NewRecorder(st),
+		Now: func() time.Time { return started }, // clock pinned to the fixed start time
+	}
+	if err := ex.Run(ctx, engine.RunInput{
+		RunID: runID, WorkflowName: "demo", Env: "dev", StartedAt: started, Input: input,
+		InterruptAfterStepIndex: &idx, // presumably stops the run after step index 0 — confirm against engine.RunInput
+	}); !errors.Is(err, engine.ErrInterrupted) { // the first execution must end with the interrupt sentinel, not success or another error
+		t.Fatalf("interrupt: %v", err)
+	}
+
+	ResetGlobalsForTest()
+	var out bytes.Buffer
+	cmd := NewRootCmd()
+	cmd.SetOut(&out)
+	cmd.SetErr(&out)
+	cmd.SetArgs([]string{
+		"run", "--resume", runID, // resume by id only; no workflow argument
+		"--project", root,
+		"-e", "staging", // matches the EnvironmentName stored on the seeded run row
+		"--state", db,
+	})
+	if err := cmd.Execute(); err != nil {
+		t.Fatalf("resume: %v\n%s", err, out.String())
+	}
+	if !strings.Contains(out.String(), "succeeded") { // CLI output is expected to mention the word "succeeded"
+		t.Fatalf("output:\n%s", out.String())
+	}
+	got, err := st.GetRun(ctx, runID) // re-read the run row through the test's own store handle
+	if err != nil {
+		t.Fatal(err)
+	}
+	if got.Status != state.RunStatusSucceeded { // the persisted status must also be succeeded after the resume
+		t.Fatalf("status %q", got.Status)
+	}
+}