From 500216aaa229f82e5e0ea62260183be496f54716 Mon Sep 17 00:00:00 2001 From: Nick Masluk Date: Tue, 26 May 2026 20:27:12 +0000 Subject: [PATCH 1/5] feat: observational mode for live-system debugging campaigns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add target_system.observational flag so campaigns whose target is a live system (cluster, service, dataset) can use repo_path purely to grant the agent shell access — without per-iteration git worktree isolation. When observational=true: - run_iteration skips create_experiment_worktree and runs the executor directly in repo_path. Prevents the FileNotFoundError "Not a git repository" failure mode and avoids polluting a non-code target with per-iteration orphan branches and .nous-experiments/ subdirs. - The design and execute_analyze prompts swap their worktree paragraphs for observational equivalents via {{execution_environment}} and {{worktree_constraint}} placeholders, so the agent is told it is probing a live target rather than mutating an isolated worktree. Default behavior is unchanged — the flag is opt-in and the worktree path remains the default for code-evolution campaigns. Tested: 10 new tests + 337 existing tests pass. 
--- orchestrator/iteration.py | 11 +- orchestrator/llm_dispatch.py | 40 ++++ prompts/methodology/design.md | 2 +- prompts/methodology/execute_analyze.md | 2 +- tests/test_observational.py | 244 +++++++++++++++++++++++++ 5 files changed, 296 insertions(+), 3 deletions(-) create mode 100644 tests/test_observational.py diff --git a/orchestrator/iteration.py b/orchestrator/iteration.py index 29e9712..e1ede52 100644 --- a/orchestrator/iteration.py +++ b/orchestrator/iteration.py @@ -370,7 +370,10 @@ def _max_turns_for(phase_key: str) -> int: cli_dispatcher.model = _model_for("execute_analyze") cli_dispatcher.max_turns = _max_turns_for("execute_analyze") exec_dispatcher = cli_dispatcher or llm_dispatcher - if repo_path: + observational = bool( + campaign.get("target_system", {}).get("observational", False) + ) + if repo_path and not observational: from orchestrator.worktree import ( create_experiment_worktree, remove_experiment_worktree, @@ -380,6 +383,12 @@ def _max_turns_for(phase_key: str) -> int: ) (iter_dir / ".experiment_id").write_text(experiment_id) print(f" Experiment worktree: {experiment_dir}") + elif repo_path: + # Observational mode: executor runs directly in repo_path. The + # target system is live (cluster, service, dataset) and there is + # nothing to isolate — bundles must contain no code_changes arms. + experiment_dir = Path(repo_path) + print(f" Observational mode: executor runs in {experiment_dir}") if cli_dispatcher: import contextlib ctx = cli_dispatcher.override_cwd(experiment_dir) if experiment_dir else contextlib.nullcontext() diff --git a/orchestrator/llm_dispatch.py b/orchestrator/llm_dispatch.py index d4f4ece..fbda775 100644 --- a/orchestrator/llm_dispatch.py +++ b/orchestrator/llm_dispatch.py @@ -35,6 +35,38 @@ # Schema cache: schema_name -> parsed schema dict _schema_cache: dict[str, dict] = {} +# Prompt fragments that swap based on target_system.observational. 
Worktree +# mode is the default — code-evolution campaigns get an isolated git worktree +# per iteration. Observational mode is for live targets (clusters, services, +# datasets) that the executor probes without per-iteration code mutation. +_WORKTREE_EXECUTION_ENV = ( + "You are running inside an isolated git worktree of the target system. " + "You own this worktree — reset it yourself with `git checkout -- .` " + "between conditions." +) +_OBSERVATIONAL_EXECUTION_ENV = ( + "You are running directly in the target system's working directory. " + "There is no per-iteration git isolation: this campaign is observational, " + "and your bundle must contain no `code_changes` arms. Do not mutate the " + "target system's persistent state — your job is to probe, measure, and " + "report. Treat any files you create as scratch artifacts that belong " + "under `{{iter_dir}}/inputs/` or `{{iter_dir}}/results/`, not in the " + "target directory." +) +_WORKTREE_DESIGN_CONSTRAINT = ( + "**Worktree isolation assumed.** The executor runs in a clean git " + "worktree. Each condition starts from clean state (`git checkout -- .` " + "runs between conditions). Design your experimental conditions assuming " + "this — don't include manual cleanup steps." +) +_OBSERVATIONAL_DESIGN_CONSTRAINT = ( + "**Observational campaign.** The executor runs directly against a live " + "target system — no git worktree, no code-change arms. All arms must be " + "pure observations of system state (probes, metrics, log scrapes). Do " + "not include `code_changes` in any arm; do not assume mutation is " + "possible without explicit consent gates." +) + class LLMDispatcher: """Dispatch agent roles to an LLM and produce schema-conformant artifacts.""" @@ -107,6 +139,11 @@ def _validate_campaign(campaign: dict) -> None: f"Campaign 'target_system.{field}' must be a list of strings. 
" f"Got: {val!r}" ) + if "observational" in ts and not isinstance(ts["observational"], bool): + raise ValueError( + f"Campaign 'target_system.observational' must be a bool. " + f"Got: {ts['observational']!r}" + ) # ------------------------------------------------------------------ # Public interface (satisfies Dispatcher protocol) @@ -212,6 +249,7 @@ def _build_context( perspective: str | None, ) -> dict[str, str]: ts = self.campaign["target_system"] + observational = bool(ts.get("observational", False)) ctx: dict[str, str] = { "target_system": ts["name"], "system_description": ts["description"], @@ -219,6 +257,8 @@ def _build_context( "controllable_knobs": ", ".join(ts["controllable_knobs"]) if ts.get("controllable_knobs") else "Not specified — planner should discover from code", "active_principles": self._format_principles(), "iteration": str(iteration), + "execution_environment": _OBSERVATIONAL_EXECUTION_ENV if observational else _WORKTREE_EXECUTION_ENV, + "worktree_constraint": _OBSERVATIONAL_DESIGN_CONSTRAINT if observational else _WORKTREE_DESIGN_CONSTRAINT, } if phase == "design": diff --git a/prompts/methodology/design.md b/prompts/methodology/design.md index 8a99173..4fb7ae0 100644 --- a/prompts/methodology/design.md +++ b/prompts/methodology/design.md @@ -158,7 +158,7 @@ Now design a hypothesis bundle based on what you actually observed and verified: - Predictions must be directional, falsifiable, and reference specific observable metrics. Do not invent arbitrary numeric thresholds unless campaign.yaml specifies them. - Base all experiment parameters on verified system behavior — if you didn't probe it, don't assume it. - **No `sed`/`awk` for code changes.** When describing code modifications in problem framing or bundle arms, describe the *intent* (what to change and why). The executor agent will implement changes properly via file edits, verify they compile, and create reusable `git diff` patches. 
Never suggest inline shell regex as an implementation strategy. -- **Worktree isolation assumed.** The executor runs in a clean git worktree. Each condition starts from clean state (`git checkout -- .` runs between conditions). Design your experimental conditions assuming this — don't include manual cleanup steps. +- {{worktree_constraint}} ## Output — Write Files Directly diff --git a/prompts/methodology/execute_analyze.md b/prompts/methodology/execute_analyze.md index 008310a..3bc0970 100644 --- a/prompts/methodology/execute_analyze.md +++ b/prompts/methodology/execute_analyze.md @@ -1,6 +1,6 @@ You are a scientific executor for the Nous hypothesis-driven experimentation framework. -You have **shell access**. You are running inside an isolated git worktree of the target system. You own this worktree — reset it yourself with `git checkout -- .` between conditions. +You have **shell access**. {{execution_environment}} Your job has FIVE phases — all in one session with full context: 1. **Prepare** — build, create patches, validate ALL commands diff --git a/tests/test_observational.py b/tests/test_observational.py new file mode 100644 index 0000000..4e28005 --- /dev/null +++ b/tests/test_observational.py @@ -0,0 +1,244 @@ +"""Tests for observational mode — campaigns where the executor probes a live +target system instead of evolving code in a git worktree. +""" +import contextlib +import json +import shutil +import warnings +from pathlib import Path +from unittest.mock import MagicMock + +import pytest + +from orchestrator.dispatch import StubDispatcher +from orchestrator.engine import Engine +from orchestrator.iteration import IterationOutcome, run_iteration +from orchestrator.llm_dispatch import LLMDispatcher + + +class _CLIStub(StubDispatcher): + """StubDispatcher with the CLIDispatcher surface area iteration.py + needs (override_cwd context manager, model/max_turns attrs). 
+ """ + + def __init__(self, work_dir, **_kw): + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + super().__init__(work_dir) + self.model = "stub" + self.max_turns = 1 + + @contextlib.contextmanager + def override_cwd(self, _cwd): + yield + + +TEMPLATES_DIR = ( + Path(__file__).resolve().parent.parent / "orchestrator" / "templates" +) + + +def _campaign(observational: bool, repo_path: Path | None = None) -> dict: + target = { + "name": "TestSystem", + "description": "A live target with no code to evolve.", + "observable_metrics": ["latency_ms"], + "controllable_knobs": ["config"], + } + if observational: + target["observational"] = True + if repo_path is not None: + target["repo_path"] = str(repo_path) + return { + "research_question": "Does the live target behave?", + "target_system": target, + "prompts": { + "methodology_layer": "prompts/methodology", + "domain_adapter_layer": None, + }, + } + + +# --------------------------------------------------------------------------- +# _validate_campaign +# --------------------------------------------------------------------------- + + +class TestCampaignValidation: + def test_observational_true_accepted(self, tmp_path): + campaign = _campaign(observational=True) + # Must not raise. 
+ LLMDispatcher._validate_campaign(campaign) + + def test_observational_false_accepted(self, tmp_path): + campaign = _campaign(observational=False) + LLMDispatcher._validate_campaign(campaign) + + def test_observational_omitted_accepted(self, tmp_path): + campaign = _campaign(observational=False) + assert "observational" not in campaign["target_system"] + LLMDispatcher._validate_campaign(campaign) + + def test_observational_non_bool_rejected(self): + campaign = _campaign(observational=False) + campaign["target_system"]["observational"] = "yes" + with pytest.raises(ValueError, match="observational.*must be a bool"): + LLMDispatcher._validate_campaign(campaign) + + +# --------------------------------------------------------------------------- +# _build_context — prompt fragment selection +# --------------------------------------------------------------------------- + + +class TestPromptFragmentSelection: + """The execution_environment and worktree_constraint placeholders swap + based on target_system.observational. The prompt loader will substitute + them into the design and execute_analyze templates. + """ + + def _dispatcher(self, tmp_path, observational: bool) -> LLMDispatcher: + # Seed the work_dir with the run_id only — no API key needed because + # _build_context never calls the LLM. 
+ work_dir = tmp_path / "work" + work_dir.mkdir() + (work_dir / "runs" / "iter-1").mkdir(parents=True) + return LLMDispatcher( + work_dir=work_dir, + campaign=_campaign(observational=observational), + completion_fn=lambda **kw: None, + ) + + def test_default_is_worktree(self, tmp_path): + d = self._dispatcher(tmp_path, observational=False) + ctx = d._build_context("planner", "design", iteration=1, perspective=None) + assert "isolated git worktree" in ctx["execution_environment"] + assert "Worktree isolation assumed" in ctx["worktree_constraint"] + + def test_observational_swaps_text(self, tmp_path): + d = self._dispatcher(tmp_path, observational=True) + ctx = d._build_context("planner", "design", iteration=1, perspective=None) + assert "Observational" in ctx["worktree_constraint"] + assert "no per-iteration git isolation" in ctx["execution_environment"] + assert "git worktree" not in ctx["execution_environment"] + + def test_design_template_renders_with_observational_constraint(self, tmp_path): + """End-to-end: load the real design.md template with observational + context and confirm the worktree paragraph is replaced. + """ + d = self._dispatcher(tmp_path, observational=True) + ctx = d._build_context("planner", "design", iteration=1, perspective=None) + rendered = d.loader.load("design", ctx) + assert "Worktree isolation assumed" not in rendered + assert "Observational campaign" in rendered + + def test_execute_analyze_template_renders_with_observational_env(self, tmp_path): + """End-to-end: load execute_analyze.md with observational context. + The {{iter_dir}} placeholder inside the observational text must also + be replaced (loader does sequential substitution). 
+ """ + d = self._dispatcher(tmp_path, observational=True) + # _build_context for execute-analyze needs a bundle.yaml and handoff.md + bundle_path = d.work_dir / "runs" / "iter-1" / "bundle.yaml" + bundle_path.write_text("metadata:\n iteration: 1\n") + (d.work_dir / "handoff.md").write_text("(stub handoff)") + (d.work_dir / "runs" / "iter-1" / "problem.md").write_text("(stub problem)") + ctx = d._build_context( + "executor", "execute-analyze", iteration=1, perspective=None, + ) + rendered = d.loader.load("execute_analyze", ctx) + assert "isolated git worktree" not in rendered + assert "no per-iteration git isolation" in rendered + assert "{{iter_dir}}" not in rendered # must be substituted + + +# --------------------------------------------------------------------------- +# Iteration loop: observational mode skips worktree creation +# --------------------------------------------------------------------------- + + +def _setup_observational_iteration( + tmp_path: Path, monkeypatch, *, repo_path: Path, +): + """Prepare a work_dir + campaign that uses observational mode and a + repo_path that is NOT a git repo. With the observational gate working, + run_iteration must complete without ever hitting create_experiment_worktree. + """ + work_dir = tmp_path / "work" + work_dir.mkdir() + for t in ("state.json", "ledger.json", "principles.json"): + shutil.copy(TEMPLATES_DIR / t, work_dir / t) + state = json.loads((work_dir / "state.json").read_text()) + state["run_id"] = "test" + (work_dir / "state.json").write_text(json.dumps(state, indent=2)) + + campaign = _campaign(observational=True, repo_path=repo_path) + + import orchestrator.iteration as ri + + def stub_factory(work_dir, campaign, model=None): + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + return StubDispatcher(work_dir) + + monkeypatch.setattr(ri, "LLMDispatcher", stub_factory) + # When repo_path is set, iteration.py would normally instantiate a + # CLIDispatcher. 
Replace it with a stub that exposes the same surface + # iteration.py touches (override_cwd, model, max_turns). + monkeypatch.setattr( + "orchestrator.cli_dispatch.CLIDispatcher", + lambda **kw: _CLIStub(kw["work_dir"]), + ) + monkeypatch.setattr( + ri, "HumanGate", + lambda: MagicMock(prompt=MagicMock(return_value=("approve", None))), + ) + return work_dir, campaign + + +class TestObservationalIterationFlow: + def test_runs_without_git_repo(self, tmp_path, monkeypatch): + """A non-git repo_path + observational=true must not raise + FileNotFoundError('Not a git repository') and must complete the + iteration. This is the regression for the magic.yaml campaign. + """ + repo = tmp_path / "live-target" + repo.mkdir() # NOT a git repo — no .git/ here. + + work_dir, campaign = _setup_observational_iteration( + tmp_path, monkeypatch, repo_path=repo, + ) + result = run_iteration(campaign, work_dir, iteration=1) + assert result == IterationOutcome.COMPLETED + assert Engine(work_dir).phase == "DONE" + + def test_no_experiment_worktree_created(self, tmp_path, monkeypatch): + repo = tmp_path / "live-target" + repo.mkdir() + work_dir, campaign = _setup_observational_iteration( + tmp_path, monkeypatch, repo_path=repo, + ) + + # Replace create_experiment_worktree with a sentinel that fails the + # test if it is ever called. The iteration import is local, so patch + # at the source module. + called = {"n": 0} + + def must_not_call(*a, **kw): + called["n"] += 1 + raise AssertionError( + "create_experiment_worktree must not be called in " + "observational mode" + ) + + monkeypatch.setattr( + "orchestrator.worktree.create_experiment_worktree", must_not_call, + ) + + run_iteration(campaign, work_dir, iteration=1) + + assert called["n"] == 0 + # No .experiment_id file should be written in observational mode. + assert not (work_dir / "runs" / "iter-1" / ".experiment_id").exists() + # No .nous-experiments/ directory should appear in the target. 
+ assert not (repo / ".nous-experiments").exists() From 12ee020fd96dc229664f6bc0f656f4ec8e64b96c Mon Sep 17 00:00:00 2001 From: Nick Masluk Date: Tue, 26 May 2026 20:43:47 +0000 Subject: [PATCH 2/5] fix: allow target_system.observational in campaign schema The observational flag was wired into validation, prompts, and the iteration loop but the JSON schema still rejected it as an unknown property, so campaigns failed at load time. Co-Authored-By: Claude Opus 4.7 --- orchestrator/schemas/campaign.schema.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/orchestrator/schemas/campaign.schema.yaml b/orchestrator/schemas/campaign.schema.yaml index 4ca0be1..8ec8f85 100644 --- a/orchestrator/schemas/campaign.schema.yaml +++ b/orchestrator/schemas/campaign.schema.yaml @@ -53,6 +53,9 @@ properties: type: ["string", "null"] minLength: 1 description: "Path to target system git repo. Used by CLIDispatcher for code-access agents. If set, experiments run in isolated worktrees." + observational: + type: boolean + description: "If true, the executor runs directly in repo_path with no per-iteration git worktree. Use for live-system probing campaigns where there is no code to evolve. Bundles must contain no code_changes arms." models: type: object From d532fa247b461d7560f0fc7b6f9e48d9b04ce6ad Mon Sep 17 00:00:00 2001 From: Nick Masluk Date: Wed, 27 May 2026 12:49:31 +0000 Subject: [PATCH 3/5] review: address PR #220 review feedback - Fix prompt body / lead-paragraph contradiction in execute_analyze.md. The lead said "no per-iteration git isolation" in observational mode, but Phase 2 still hardcoded `git checkout -- .` between conditions (which would fail with no .git) and framed result-path warnings as "the worktree is temporary." Replace the reset step with a new {{condition_reset}} placeholder and rephrase the persistence note to be accurate in both modes. 
- Fix validation bypass: extract _validate_campaign to a module-level validate_campaign() and call it at the top of run_iteration. The staticmethod was only invoked from LLMDispatcher.__init__, so inline-agent mode (which never builds an LLMDispatcher) silently coerced non-bool observational values via bool() further down. - Add regression test that create_experiment_worktree IS called when observational=False (existing tests would all pass if the gate were inverted). - Loosen brittle prompt-text assertions: import the fragment constants and assert constant identity / containment instead of substrings, so copy-edits to the prompt text don't churn six tests. Co-Authored-By: Claude Opus 4.7 --- orchestrator/iteration.py | 7 ++ orchestrator/llm_dispatch.py | 76 ++++++++++------- prompts/methodology/execute_analyze.md | 6 +- tests/test_observational.py | 111 ++++++++++++++++++++----- 4 files changed, 145 insertions(+), 55 deletions(-) diff --git a/orchestrator/iteration.py b/orchestrator/iteration.py index e1ede52..4f8dcfc 100644 --- a/orchestrator/iteration.py +++ b/orchestrator/iteration.py @@ -255,6 +255,13 @@ def run_iteration( Returns: An IterationOutcome value: COMPLETED, CONTINUE, ABORTED, or REDESIGN. """ + # Validate the campaign once, up front. The staticmethod on LLMDispatcher + # is also called from its constructor, but inline-agent mode never builds + # an LLMDispatcher — without this call, a non-bool `observational` value + # would slip past validation and silently coerce via bool() below. + from orchestrator.llm_dispatch import validate_campaign + validate_campaign(campaign) + engine = Engine(work_dir) repo_path = campaign.get("target_system", {}).get("repo_path") diff --git a/orchestrator/llm_dispatch.py b/orchestrator/llm_dispatch.py index fbda775..f61696f 100644 --- a/orchestrator/llm_dispatch.py +++ b/orchestrator/llm_dispatch.py @@ -67,6 +67,50 @@ "possible without explicit consent gates." 
) +# Per-condition reset step in execute_analyze.md Phase 2. Worktree mode resets +# tracked files between conditions; observational mode has no checkout to +# revert and instead reminds the agent not to mutate the live target. +_WORKTREE_CONDITION_RESET = "Reset worktree: `git checkout -- .`" +_OBSERVATIONAL_CONDITION_RESET = ( + "Do not mutate the target system between conditions. Any files you " + "wrote to the target directory during the previous condition must be " + "removed before the next one runs (this is your responsibility — " + "there is no automatic checkout)." +) + + +def validate_campaign(campaign: dict) -> None: + """Validate campaign config. Module-level so it can be called before any + dispatcher is constructed (e.g., from `run_iteration` in inline-agent mode, + where no LLMDispatcher is built and the staticmethod path is never taken). + """ + ts = campaign.get("target_system") + if not isinstance(ts, dict): + raise ValueError( + "Campaign config missing 'target_system' section. " + "See examples/campaign.yaml for the expected format." + ) + required = ["name", "description"] + missing = [k for k in required if k not in ts] + if missing: + raise ValueError( + f"Campaign 'target_system' missing required keys: {missing}. " + f"See examples/campaign.yaml for the expected format." + ) + for field in ("observable_metrics", "controllable_knobs"): + val = ts.get(field) + if val is not None: + if not isinstance(val, list) or not all(isinstance(x, str) for x in val): + raise ValueError( + f"Campaign 'target_system.{field}' must be a list of strings. " + f"Got: {val!r}" + ) + if "observational" in ts and not isinstance(ts["observational"], bool): + raise ValueError( + f"Campaign 'target_system.observational' must be a bool. 
" + f"Got: {ts['observational']!r}" + ) + class LLMDispatcher: """Dispatch agent roles to an LLM and produce schema-conformant artifacts.""" @@ -82,7 +126,7 @@ def __init__( completion_fn: Callable | None = None, ) -> None: self.work_dir = Path(work_dir) - self._validate_campaign(campaign) + validate_campaign(campaign) self.campaign = campaign self.model = model self.loader = PromptLoader( @@ -116,34 +160,7 @@ def __init__( dal, ) - @staticmethod - def _validate_campaign(campaign: dict) -> None: - ts = campaign.get("target_system") - if not isinstance(ts, dict): - raise ValueError( - "Campaign config missing 'target_system' section. " - "See examples/campaign.yaml for the expected format." - ) - required = ["name", "description"] - missing = [k for k in required if k not in ts] - if missing: - raise ValueError( - f"Campaign 'target_system' missing required keys: {missing}. " - f"See examples/campaign.yaml for the expected format." - ) - for field in ("observable_metrics", "controllable_knobs"): - val = ts.get(field) - if val is not None: - if not isinstance(val, list) or not all(isinstance(x, str) for x in val): - raise ValueError( - f"Campaign 'target_system.{field}' must be a list of strings. " - f"Got: {val!r}" - ) - if "observational" in ts and not isinstance(ts["observational"], bool): - raise ValueError( - f"Campaign 'target_system.observational' must be a bool. 
" - f"Got: {ts['observational']!r}" - ) + _validate_campaign = staticmethod(validate_campaign) # ------------------------------------------------------------------ # Public interface (satisfies Dispatcher protocol) @@ -259,6 +276,7 @@ def _build_context( "iteration": str(iteration), "execution_environment": _OBSERVATIONAL_EXECUTION_ENV if observational else _WORKTREE_EXECUTION_ENV, "worktree_constraint": _OBSERVATIONAL_DESIGN_CONSTRAINT if observational else _WORKTREE_DESIGN_CONSTRAINT, + "condition_reset": _OBSERVATIONAL_CONDITION_RESET if observational else _WORKTREE_CONDITION_RESET, } if phase == "design": diff --git a/prompts/methodology/execute_analyze.md b/prompts/methodology/execute_analyze.md index 3bc0970..1ddd34a 100644 --- a/prompts/methodology/execute_analyze.md +++ b/prompts/methodology/execute_analyze.md @@ -105,7 +105,7 @@ arms: ``` **Important:** -- All output paths MUST use absolute paths under `{{iter_dir}}/results/`. Do NOT use relative paths — the experiment runs in a worktree that gets cleaned up. +- All output paths MUST use absolute paths under `{{iter_dir}}/results/`. Do NOT use relative paths — only files under `{{iter_dir}}/` are guaranteed to persist past this session. - Create per-arm result subdirectories before writing output: `mkdir -p {{iter_dir}}/results/` (the top-level `results/` already exists, but per-arm subdirectories like `results/h-main/` do not). - If you create ANY input files for the experiment (config files, workload specs, policy definitions, parameter files), write them to `{{iter_dir}}/inputs/` and list them in the condition's `inputs` array. Do NOT write input files to `/tmp/` or other temporary locations — they will be lost and the experiment will not be reproducible. @@ -114,13 +114,13 @@ arms: Run the experiment plan you wrote in Step 4 — execute every command exactly as written. The plan is the source of truth. For each condition: -1. Reset worktree: `git checkout -- .` +1. {{condition_reset}} 2. 
Run the `cmd` from the plan 3. Verify the `output` file was created at the expected path After each baseline+treatment pair with the same seed, compare key metrics. If they are byte-identical, STOP and investigate — the patch may not be affecting the code path. -**All results must land in `{{iter_dir}}/results/`.** The worktree is temporary — anything written there will be lost. +**All results must land in `{{iter_dir}}/results/`.** Only files under `{{iter_dir}}/` are guaranteed to persist — anything written elsewhere may be lost. ## Phase 3: Analyze and Write Findings diff --git a/tests/test_observational.py b/tests/test_observational.py index 4e28005..04b08ed 100644 --- a/tests/test_observational.py +++ b/tests/test_observational.py @@ -13,7 +13,13 @@ from orchestrator.dispatch import StubDispatcher from orchestrator.engine import Engine from orchestrator.iteration import IterationOutcome, run_iteration -from orchestrator.llm_dispatch import LLMDispatcher +from orchestrator.llm_dispatch import ( + LLMDispatcher, + _OBSERVATIONAL_DESIGN_CONSTRAINT, + _OBSERVATIONAL_EXECUTION_ENV, + _WORKTREE_DESIGN_CONSTRAINT, + _WORKTREE_EXECUTION_ENV, +) class _CLIStub(StubDispatcher): @@ -82,7 +88,7 @@ def test_observational_omitted_accepted(self, tmp_path): def test_observational_non_bool_rejected(self): campaign = _campaign(observational=False) campaign["target_system"]["observational"] = "yes" - with pytest.raises(ValueError, match="observational.*must be a bool"): + with pytest.raises(ValueError, match="observational"): LLMDispatcher._validate_campaign(campaign) @@ -112,30 +118,31 @@ def _dispatcher(self, tmp_path, observational: bool) -> LLMDispatcher: def test_default_is_worktree(self, tmp_path): d = self._dispatcher(tmp_path, observational=False) ctx = d._build_context("planner", "design", iteration=1, perspective=None) - assert "isolated git worktree" in ctx["execution_environment"] - assert "Worktree isolation assumed" in ctx["worktree_constraint"] + assert 
ctx["execution_environment"] == _WORKTREE_EXECUTION_ENV + assert ctx["worktree_constraint"] == _WORKTREE_DESIGN_CONSTRAINT def test_observational_swaps_text(self, tmp_path): d = self._dispatcher(tmp_path, observational=True) ctx = d._build_context("planner", "design", iteration=1, perspective=None) - assert "Observational" in ctx["worktree_constraint"] - assert "no per-iteration git isolation" in ctx["execution_environment"] - assert "git worktree" not in ctx["execution_environment"] + # _OBSERVATIONAL_EXECUTION_ENV embeds {{iter_dir}}, so context-level + # equality holds (substitution happens later, in the loader). + assert ctx["execution_environment"] == _OBSERVATIONAL_EXECUTION_ENV + assert ctx["worktree_constraint"] == _OBSERVATIONAL_DESIGN_CONSTRAINT def test_design_template_renders_with_observational_constraint(self, tmp_path): - """End-to-end: load the real design.md template with observational - context and confirm the worktree paragraph is replaced. + """End-to-end: the real design.md picks up the observational constraint + and drops the worktree variant. """ d = self._dispatcher(tmp_path, observational=True) ctx = d._build_context("planner", "design", iteration=1, perspective=None) rendered = d.loader.load("design", ctx) - assert "Worktree isolation assumed" not in rendered - assert "Observational campaign" in rendered + assert _WORKTREE_DESIGN_CONSTRAINT not in rendered + assert _OBSERVATIONAL_DESIGN_CONSTRAINT in rendered def test_execute_analyze_template_renders_with_observational_env(self, tmp_path): - """End-to-end: load execute_analyze.md with observational context. - The {{iter_dir}} placeholder inside the observational text must also - be replaced (loader does sequential substitution). + """End-to-end: execute_analyze.md picks up the observational execution + environment. The {{iter_dir}} embedded in the observational text must + be substituted by the loader's sequential pass. 
""" d = self._dispatcher(tmp_path, observational=True) # _build_context for execute-analyze needs a bundle.yaml and handoff.md @@ -147,9 +154,12 @@ def test_execute_analyze_template_renders_with_observational_env(self, tmp_path) "executor", "execute-analyze", iteration=1, perspective=None, ) rendered = d.loader.load("execute_analyze", ctx) - assert "isolated git worktree" not in rendered - assert "no per-iteration git isolation" in rendered - assert "{{iter_dir}}" not in rendered # must be substituted + assert _WORKTREE_EXECUTION_ENV not in rendered + # The observational fragment is rendered AFTER {{iter_dir}} substitution, + # so we assert against the post-substitution version. + iter_dir = str((d.work_dir / "runs" / "iter-1").resolve()) + assert _OBSERVATIONAL_EXECUTION_ENV.replace("{{iter_dir}}", iter_dir) in rendered + assert "{{iter_dir}}" not in rendered # no leftover placeholders # --------------------------------------------------------------------------- @@ -157,12 +167,17 @@ def test_execute_analyze_template_renders_with_observational_env(self, tmp_path) # --------------------------------------------------------------------------- -def _setup_observational_iteration( - tmp_path: Path, monkeypatch, *, repo_path: Path, +def _setup_iteration( + tmp_path: Path, + monkeypatch, + *, + repo_path: Path, + observational: bool, ): - """Prepare a work_dir + campaign that uses observational mode and a - repo_path that is NOT a git repo. With the observational gate working, - run_iteration must complete without ever hitting create_experiment_worktree. + """Prepare a work_dir + campaign for an iteration test. Stubs the LLM and + CLI dispatchers and the human gate so run_iteration completes without an + API key. Use `observational=True` to test the live-target path, + `observational=False` to test the worktree path. 
""" work_dir = tmp_path / "work" work_dir.mkdir() @@ -172,7 +187,7 @@ def _setup_observational_iteration( state["run_id"] = "test" (work_dir / "state.json").write_text(json.dumps(state, indent=2)) - campaign = _campaign(observational=True, repo_path=repo_path) + campaign = _campaign(observational=observational, repo_path=repo_path) import orchestrator.iteration as ri @@ -196,6 +211,13 @@ def stub_factory(work_dir, campaign, model=None): return work_dir, campaign +def _setup_observational_iteration(tmp_path: Path, monkeypatch, *, repo_path: Path): + """Back-compat shim — observational helper preserved for clarity at call sites.""" + return _setup_iteration( + tmp_path, monkeypatch, repo_path=repo_path, observational=True, + ) + + class TestObservationalIterationFlow: def test_runs_without_git_repo(self, tmp_path, monkeypatch): """A non-git repo_path + observational=true must not raise @@ -242,3 +264,46 @@ def must_not_call(*a, **kw): assert not (work_dir / "runs" / "iter-1" / ".experiment_id").exists() # No .nous-experiments/ directory should appear in the target. assert not (repo / ".nous-experiments").exists() + + +class TestWorktreeIterationFlow: + """Regression: with observational=False (or omitted), repo_path must + still trigger create_experiment_worktree. Without this test, inverting + the gate at iteration.py would only break observational tests. 
+ """ + + def test_worktree_created_when_not_observational(self, tmp_path, monkeypatch): + repo = tmp_path / "code-target" + repo.mkdir() + work_dir, campaign = _setup_iteration( + tmp_path, monkeypatch, repo_path=repo, observational=False, + ) + + create_calls: list[tuple] = [] + remove_calls: list[tuple] = [] + + def fake_create(repo_path, iteration): + create_calls.append((Path(repo_path), iteration)) + experiment_dir = tmp_path / "fake-worktree" + experiment_dir.mkdir(exist_ok=True) + return experiment_dir, "fake-experiment-id" + + def fake_remove(repo_path, experiment_id): + remove_calls.append((Path(repo_path), experiment_id)) + + monkeypatch.setattr( + "orchestrator.worktree.create_experiment_worktree", fake_create, + ) + monkeypatch.setattr( + "orchestrator.worktree.remove_experiment_worktree", fake_remove, + ) + + result = run_iteration(campaign, work_dir, iteration=1) + + assert result == IterationOutcome.COMPLETED + assert create_calls == [(repo, 1)] + assert remove_calls == [(repo, "fake-experiment-id")] + # .experiment_id file should be written in worktree mode. + assert ( + work_dir / "runs" / "iter-1" / ".experiment_id" + ).read_text() == "fake-experiment-id" From c4210479fc9324925d7b1b13bc6e26edb9853976 Mon Sep 17 00:00:00 2001 From: Nick Masluk Date: Wed, 27 May 2026 19:04:24 +0000 Subject: [PATCH 4/5] =?UTF-8?q?rename:=20observational=20=E2=86=92=20live?= =?UTF-8?q?=5Ftarget=20per=20reviewer=20feedback?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewer flagged that "observational" collides with the existing observe-mode in execute_analyze.md, which means "the bundle has no code_changes arms" — a bundle-level property, not the infra-level concern of whether to skip worktree creation. The new flag controls executor environment (live system vs. isolated worktree), so `live_target` is a more accurate name. 
Mechanical rename across iteration.py, llm_dispatch.py, campaign.schema.yaml, and the test module. Co-Authored-By: Claude Opus 4.7 --- orchestrator/iteration.py | 16 +-- orchestrator/llm_dispatch.py | 53 +++++---- orchestrator/schemas/campaign.schema.yaml | 4 +- ...t_observational.py => test_live_target.py} | 108 +++++++++--------- 4 files changed, 92 insertions(+), 89 deletions(-) rename tests/{test_observational.py => test_live_target.py} (71%) diff --git a/orchestrator/iteration.py b/orchestrator/iteration.py index 4f8dcfc..203cccf 100644 --- a/orchestrator/iteration.py +++ b/orchestrator/iteration.py @@ -257,7 +257,7 @@ def run_iteration( """ # Validate the campaign once, up front. The staticmethod on LLMDispatcher # is also called from its constructor, but inline-agent mode never builds - # an LLMDispatcher — without this call, a non-bool `observational` value + # an LLMDispatcher — without this call, a non-bool `live_target` value # would slip past validation and silently coerce via bool() below. from orchestrator.llm_dispatch import validate_campaign validate_campaign(campaign) @@ -377,10 +377,10 @@ def _max_turns_for(phase_key: str) -> int: cli_dispatcher.model = _model_for("execute_analyze") cli_dispatcher.max_turns = _max_turns_for("execute_analyze") exec_dispatcher = cli_dispatcher or llm_dispatcher - observational = bool( - campaign.get("target_system", {}).get("observational", False) + live_target = bool( + campaign.get("target_system", {}).get("live_target", False) ) - if repo_path and not observational: + if repo_path and not live_target: from orchestrator.worktree import ( create_experiment_worktree, remove_experiment_worktree, @@ -391,11 +391,11 @@ def _max_turns_for(phase_key: str) -> int: (iter_dir / ".experiment_id").write_text(experiment_id) print(f" Experiment worktree: {experiment_dir}") elif repo_path: - # Observational mode: executor runs directly in repo_path. 
The - # target system is live (cluster, service, dataset) and there is - # nothing to isolate — bundles must contain no code_changes arms. + # Live-target mode: executor runs directly in repo_path. The + # target system is running (cluster, service, dataset) and there + # is nothing to isolate — bundles must contain no code_changes arms. experiment_dir = Path(repo_path) - print(f" Observational mode: executor runs in {experiment_dir}") + print(f" Live target: executor runs in {experiment_dir}") if cli_dispatcher: import contextlib ctx = cli_dispatcher.override_cwd(experiment_dir) if experiment_dir else contextlib.nullcontext() diff --git a/orchestrator/llm_dispatch.py b/orchestrator/llm_dispatch.py index f61696f..506a2eb 100644 --- a/orchestrator/llm_dispatch.py +++ b/orchestrator/llm_dispatch.py @@ -35,23 +35,26 @@ # Schema cache: schema_name -> parsed schema dict _schema_cache: dict[str, dict] = {} -# Prompt fragments that swap based on target_system.observational. Worktree +# Prompt fragments that swap based on target_system.live_target. Worktree # mode is the default — code-evolution campaigns get an isolated git worktree -# per iteration. Observational mode is for live targets (clusters, services, +# per iteration. Live-target mode is for running systems (clusters, services, # datasets) that the executor probes without per-iteration code mutation. +# (The flag is `live_target` rather than `observational` to avoid colliding +# with the existing "observe mode" in execute_analyze.md, which means +# "the bundle has no code_changes arms.") _WORKTREE_EXECUTION_ENV = ( "You are running inside an isolated git worktree of the target system. " "You own this worktree — reset it yourself with `git checkout -- .` " "between conditions." ) -_OBSERVATIONAL_EXECUTION_ENV = ( - "You are running directly in the target system's working directory. 
" - "There is no per-iteration git isolation: this campaign is observational, " - "and your bundle must contain no `code_changes` arms. Do not mutate the " - "target system's persistent state — your job is to probe, measure, and " - "report. Treat any files you create as scratch artifacts that belong " - "under `{{iter_dir}}/inputs/` or `{{iter_dir}}/results/`, not in the " - "target directory." +_LIVE_TARGET_EXECUTION_ENV = ( + "You are running directly against a live target system, in its working " + "directory. There is no per-iteration git isolation, and your bundle " + "must contain no `code_changes` arms. Do not mutate the target system's " + "persistent state — your job is to probe, measure, and report. Treat " + "any files you create as scratch artifacts that belong under " + "`{{iter_dir}}/inputs/` or `{{iter_dir}}/results/`, not in the target " + "directory." ) _WORKTREE_DESIGN_CONSTRAINT = ( "**Worktree isolation assumed.** The executor runs in a clean git " @@ -59,19 +62,19 @@ "runs between conditions). Design your experimental conditions assuming " "this — don't include manual cleanup steps." ) -_OBSERVATIONAL_DESIGN_CONSTRAINT = ( - "**Observational campaign.** The executor runs directly against a live " - "target system — no git worktree, no code-change arms. All arms must be " - "pure observations of system state (probes, metrics, log scrapes). Do " - "not include `code_changes` in any arm; do not assume mutation is " - "possible without explicit consent gates." +_LIVE_TARGET_DESIGN_CONSTRAINT = ( + "**Live target system.** The executor runs directly against a running " + "system — no git worktree, no code-change arms. All arms must be pure " + "observations of system state (probes, metrics, log scrapes). Do not " + "include `code_changes` in any arm; do not assume mutation is possible " + "without explicit consent gates." ) # Per-condition reset step in execute_analyze.md Phase 2. 
Worktree mode resets -# tracked files between conditions; observational mode has no checkout to +# tracked files between conditions; live-target mode has no checkout to # revert and instead reminds the agent not to mutate the live target. _WORKTREE_CONDITION_RESET = "Reset worktree: `git checkout -- .`" -_OBSERVATIONAL_CONDITION_RESET = ( +_LIVE_TARGET_CONDITION_RESET = ( "Do not mutate the target system between conditions. Any files you " "wrote to the target directory during the previous condition must be " "removed before the next one runs (this is your responsibility — " @@ -105,10 +108,10 @@ def validate_campaign(campaign: dict) -> None: f"Campaign 'target_system.{field}' must be a list of strings. " f"Got: {val!r}" ) - if "observational" in ts and not isinstance(ts["observational"], bool): + if "live_target" in ts and not isinstance(ts["live_target"], bool): raise ValueError( - f"Campaign 'target_system.observational' must be a bool. " - f"Got: {ts['observational']!r}" + f"Campaign 'target_system.live_target' must be a bool. 
" + f"Got: {ts['live_target']!r}" ) @@ -266,7 +269,7 @@ def _build_context( perspective: str | None, ) -> dict[str, str]: ts = self.campaign["target_system"] - observational = bool(ts.get("observational", False)) + live_target = bool(ts.get("live_target", False)) ctx: dict[str, str] = { "target_system": ts["name"], "system_description": ts["description"], @@ -274,9 +277,9 @@ def _build_context( "controllable_knobs": ", ".join(ts["controllable_knobs"]) if ts.get("controllable_knobs") else "Not specified — planner should discover from code", "active_principles": self._format_principles(), "iteration": str(iteration), - "execution_environment": _OBSERVATIONAL_EXECUTION_ENV if observational else _WORKTREE_EXECUTION_ENV, - "worktree_constraint": _OBSERVATIONAL_DESIGN_CONSTRAINT if observational else _WORKTREE_DESIGN_CONSTRAINT, - "condition_reset": _OBSERVATIONAL_CONDITION_RESET if observational else _WORKTREE_CONDITION_RESET, + "execution_environment": _LIVE_TARGET_EXECUTION_ENV if live_target else _WORKTREE_EXECUTION_ENV, + "worktree_constraint": _LIVE_TARGET_DESIGN_CONSTRAINT if live_target else _WORKTREE_DESIGN_CONSTRAINT, + "condition_reset": _LIVE_TARGET_CONDITION_RESET if live_target else _WORKTREE_CONDITION_RESET, } if phase == "design": diff --git a/orchestrator/schemas/campaign.schema.yaml b/orchestrator/schemas/campaign.schema.yaml index 8ec8f85..af3d9df 100644 --- a/orchestrator/schemas/campaign.schema.yaml +++ b/orchestrator/schemas/campaign.schema.yaml @@ -53,9 +53,9 @@ properties: type: ["string", "null"] minLength: 1 description: "Path to target system git repo. Used by CLIDispatcher for code-access agents. If set, experiments run in isolated worktrees." - observational: + live_target: type: boolean - description: "If true, the executor runs directly in repo_path with no per-iteration git worktree. Use for live-system probing campaigns where there is no code to evolve. Bundles must contain no code_changes arms." 
+ description: "If true, the executor runs directly in repo_path with no per-iteration git worktree. Use for campaigns that probe a running system (cluster, service, dataset) where there is no code to evolve. Bundles must contain no code_changes arms." models: type: object diff --git a/tests/test_observational.py b/tests/test_live_target.py similarity index 71% rename from tests/test_observational.py rename to tests/test_live_target.py index 04b08ed..87db9ac 100644 --- a/tests/test_observational.py +++ b/tests/test_live_target.py @@ -1,4 +1,4 @@ -"""Tests for observational mode — campaigns where the executor probes a live +"""Tests for live-target mode — campaigns where the executor probes a running target system instead of evolving code in a git worktree. """ import contextlib @@ -15,8 +15,8 @@ from orchestrator.iteration import IterationOutcome, run_iteration from orchestrator.llm_dispatch import ( LLMDispatcher, - _OBSERVATIONAL_DESIGN_CONSTRAINT, - _OBSERVATIONAL_EXECUTION_ENV, + _LIVE_TARGET_DESIGN_CONSTRAINT, + _LIVE_TARGET_EXECUTION_ENV, _WORKTREE_DESIGN_CONSTRAINT, _WORKTREE_EXECUTION_ENV, ) @@ -44,15 +44,15 @@ def override_cwd(self, _cwd): ) -def _campaign(observational: bool, repo_path: Path | None = None) -> dict: +def _campaign(live_target: bool, repo_path: Path | None = None) -> dict: target = { "name": "TestSystem", "description": "A live target with no code to evolve.", "observable_metrics": ["latency_ms"], "controllable_knobs": ["config"], } - if observational: - target["observational"] = True + if live_target: + target["live_target"] = True if repo_path is not None: target["repo_path"] = str(repo_path) return { @@ -71,24 +71,24 @@ def _campaign(observational: bool, repo_path: Path | None = None) -> dict: class TestCampaignValidation: - def test_observational_true_accepted(self, tmp_path): - campaign = _campaign(observational=True) + def test_live_target_true_accepted(self, tmp_path): + campaign = _campaign(live_target=True) # Must not raise. 
LLMDispatcher._validate_campaign(campaign) - def test_observational_false_accepted(self, tmp_path): - campaign = _campaign(observational=False) + def test_live_target_false_accepted(self, tmp_path): + campaign = _campaign(live_target=False) LLMDispatcher._validate_campaign(campaign) - def test_observational_omitted_accepted(self, tmp_path): - campaign = _campaign(observational=False) - assert "observational" not in campaign["target_system"] + def test_live_target_omitted_accepted(self, tmp_path): + campaign = _campaign(live_target=False) + assert "live_target" not in campaign["target_system"] LLMDispatcher._validate_campaign(campaign) - def test_observational_non_bool_rejected(self): - campaign = _campaign(observational=False) - campaign["target_system"]["observational"] = "yes" - with pytest.raises(ValueError, match="observational"): + def test_live_target_non_bool_rejected(self): + campaign = _campaign(live_target=False) + campaign["target_system"]["live_target"] = "yes" + with pytest.raises(ValueError, match="live_target"): LLMDispatcher._validate_campaign(campaign) @@ -99,11 +99,11 @@ def test_observational_non_bool_rejected(self): class TestPromptFragmentSelection: """The execution_environment and worktree_constraint placeholders swap - based on target_system.observational. The prompt loader will substitute + based on target_system.live_target. The prompt loader will substitute them into the design and execute_analyze templates. """ - def _dispatcher(self, tmp_path, observational: bool) -> LLMDispatcher: + def _dispatcher(self, tmp_path, live_target: bool) -> LLMDispatcher: # Seed the work_dir with the run_id only — no API key needed because # _build_context never calls the LLM. 
work_dir = tmp_path / "work" @@ -111,40 +111,40 @@ def _dispatcher(self, tmp_path, observational: bool) -> LLMDispatcher: (work_dir / "runs" / "iter-1").mkdir(parents=True) return LLMDispatcher( work_dir=work_dir, - campaign=_campaign(observational=observational), + campaign=_campaign(live_target=live_target), completion_fn=lambda **kw: None, ) def test_default_is_worktree(self, tmp_path): - d = self._dispatcher(tmp_path, observational=False) + d = self._dispatcher(tmp_path, live_target=False) ctx = d._build_context("planner", "design", iteration=1, perspective=None) assert ctx["execution_environment"] == _WORKTREE_EXECUTION_ENV assert ctx["worktree_constraint"] == _WORKTREE_DESIGN_CONSTRAINT - def test_observational_swaps_text(self, tmp_path): - d = self._dispatcher(tmp_path, observational=True) + def test_live_target_swaps_text(self, tmp_path): + d = self._dispatcher(tmp_path, live_target=True) ctx = d._build_context("planner", "design", iteration=1, perspective=None) - # _OBSERVATIONAL_EXECUTION_ENV embeds {{iter_dir}}, so context-level + # _LIVE_TARGET_EXECUTION_ENV embeds {{iter_dir}}, so context-level # equality holds (substitution happens later, in the loader). - assert ctx["execution_environment"] == _OBSERVATIONAL_EXECUTION_ENV - assert ctx["worktree_constraint"] == _OBSERVATIONAL_DESIGN_CONSTRAINT + assert ctx["execution_environment"] == _LIVE_TARGET_EXECUTION_ENV + assert ctx["worktree_constraint"] == _LIVE_TARGET_DESIGN_CONSTRAINT - def test_design_template_renders_with_observational_constraint(self, tmp_path): - """End-to-end: the real design.md picks up the observational constraint + def test_design_template_renders_with_live_target_constraint(self, tmp_path): + """End-to-end: the real design.md picks up the live-target constraint and drops the worktree variant. 
""" - d = self._dispatcher(tmp_path, observational=True) + d = self._dispatcher(tmp_path, live_target=True) ctx = d._build_context("planner", "design", iteration=1, perspective=None) rendered = d.loader.load("design", ctx) assert _WORKTREE_DESIGN_CONSTRAINT not in rendered - assert _OBSERVATIONAL_DESIGN_CONSTRAINT in rendered + assert _LIVE_TARGET_DESIGN_CONSTRAINT in rendered - def test_execute_analyze_template_renders_with_observational_env(self, tmp_path): - """End-to-end: execute_analyze.md picks up the observational execution - environment. The {{iter_dir}} embedded in the observational text must + def test_execute_analyze_template_renders_with_live_target_env(self, tmp_path): + """End-to-end: execute_analyze.md picks up the live-target execution + environment. The {{iter_dir}} embedded in the live-target text must be substituted by the loader's sequential pass. """ - d = self._dispatcher(tmp_path, observational=True) + d = self._dispatcher(tmp_path, live_target=True) # _build_context for execute-analyze needs a bundle.yaml and handoff.md bundle_path = d.work_dir / "runs" / "iter-1" / "bundle.yaml" bundle_path.write_text("metadata:\n iteration: 1\n") @@ -155,15 +155,15 @@ def test_execute_analyze_template_renders_with_observational_env(self, tmp_path) ) rendered = d.loader.load("execute_analyze", ctx) assert _WORKTREE_EXECUTION_ENV not in rendered - # The observational fragment is rendered AFTER {{iter_dir}} substitution, + # The live-target fragment is rendered AFTER {{iter_dir}} substitution, # so we assert against the post-substitution version. 
iter_dir = str((d.work_dir / "runs" / "iter-1").resolve()) - assert _OBSERVATIONAL_EXECUTION_ENV.replace("{{iter_dir}}", iter_dir) in rendered + assert _LIVE_TARGET_EXECUTION_ENV.replace("{{iter_dir}}", iter_dir) in rendered assert "{{iter_dir}}" not in rendered # no leftover placeholders # --------------------------------------------------------------------------- -# Iteration loop: observational mode skips worktree creation +# Iteration loop: live-target mode skips worktree creation # --------------------------------------------------------------------------- @@ -172,12 +172,12 @@ def _setup_iteration( monkeypatch, *, repo_path: Path, - observational: bool, + live_target: bool, ): """Prepare a work_dir + campaign for an iteration test. Stubs the LLM and CLI dispatchers and the human gate so run_iteration completes without an - API key. Use `observational=True` to test the live-target path, - `observational=False` to test the worktree path. + API key. Use `live_target=True` to test the live-target path, + `live_target=False` to test the worktree path. 
""" work_dir = tmp_path / "work" work_dir.mkdir() @@ -187,7 +187,7 @@ def _setup_iteration( state["run_id"] = "test" (work_dir / "state.json").write_text(json.dumps(state, indent=2)) - campaign = _campaign(observational=observational, repo_path=repo_path) + campaign = _campaign(live_target=live_target, repo_path=repo_path) import orchestrator.iteration as ri @@ -211,23 +211,23 @@ def stub_factory(work_dir, campaign, model=None): return work_dir, campaign -def _setup_observational_iteration(tmp_path: Path, monkeypatch, *, repo_path: Path): - """Back-compat shim — observational helper preserved for clarity at call sites.""" +def _setup_live_target_iteration(tmp_path: Path, monkeypatch, *, repo_path: Path): + """Convenience wrapper for the live-target path.""" return _setup_iteration( - tmp_path, monkeypatch, repo_path=repo_path, observational=True, + tmp_path, monkeypatch, repo_path=repo_path, live_target=True, ) -class TestObservationalIterationFlow: +class TestLiveTargetIterationFlow: def test_runs_without_git_repo(self, tmp_path, monkeypatch): - """A non-git repo_path + observational=true must not raise + """A non-git repo_path + live_target=true must not raise FileNotFoundError('Not a git repository') and must complete the iteration. This is the regression for the magic.yaml campaign. """ repo = tmp_path / "live-target" repo.mkdir() # NOT a git repo — no .git/ here. 
- work_dir, campaign = _setup_observational_iteration( + work_dir, campaign = _setup_live_target_iteration( tmp_path, monkeypatch, repo_path=repo, ) result = run_iteration(campaign, work_dir, iteration=1) @@ -237,7 +237,7 @@ def test_runs_without_git_repo(self, tmp_path, monkeypatch): def test_no_experiment_worktree_created(self, tmp_path, monkeypatch): repo = tmp_path / "live-target" repo.mkdir() - work_dir, campaign = _setup_observational_iteration( + work_dir, campaign = _setup_live_target_iteration( tmp_path, monkeypatch, repo_path=repo, ) @@ -250,7 +250,7 @@ def must_not_call(*a, **kw): called["n"] += 1 raise AssertionError( "create_experiment_worktree must not be called in " - "observational mode" + "live-target mode" ) monkeypatch.setattr( @@ -260,23 +260,23 @@ def must_not_call(*a, **kw): run_iteration(campaign, work_dir, iteration=1) assert called["n"] == 0 - # No .experiment_id file should be written in observational mode. + # No .experiment_id file should be written in live-target mode. assert not (work_dir / "runs" / "iter-1" / ".experiment_id").exists() # No .nous-experiments/ directory should appear in the target. assert not (repo / ".nous-experiments").exists() class TestWorktreeIterationFlow: - """Regression: with observational=False (or omitted), repo_path must + """Regression: with live_target=False (or omitted), repo_path must still trigger create_experiment_worktree. Without this test, inverting - the gate at iteration.py would only break observational tests. + the gate at iteration.py would only break live-target tests. 
""" - def test_worktree_created_when_not_observational(self, tmp_path, monkeypatch): + def test_worktree_created_when_not_live_target(self, tmp_path, monkeypatch): repo = tmp_path / "code-target" repo.mkdir() work_dir, campaign = _setup_iteration( - tmp_path, monkeypatch, repo_path=repo, observational=False, + tmp_path, monkeypatch, repo_path=repo, live_target=False, ) create_calls: list[tuple] = [] From 274a6e41554dd8e36f9c98c7d4166de047a21127 Mon Sep 17 00:00:00 2001 From: Nick Masluk Date: Tue, 2 Jun 2026 16:04:26 +0000 Subject: [PATCH 5/5] docs: document live_target campaigns in README and quickstart Reviewer asked for user-facing docs on when and how to use live_target: true so the feature is discoverable without reading the PR description or schema. Adds a quickstart section with an example campaign and contrasts live_target (campaign-level, no worktree, all arms must be probes) with observe-mode arms (bundle-level, worktree still created). README points to the new section. Co-Authored-By: Claude Opus 4.7 --- README.md | 2 ++ docs/quickstart.md | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/README.md b/README.md index 706c21e..07f0ecc 100644 --- a/README.md +++ b/README.md @@ -123,6 +123,8 @@ When `repo_path` is set, the campaign directory is created inside the target rep The planner explores the codebase to discover metrics, knobs, and execution methods. You can optionally provide `observable_metrics` and `controllable_knobs` as hints — see [examples/campaign.yaml](examples/campaign.yaml) for all options. +If your target is a *running* system rather than a codebase (a cluster, a deployed service, a scratch directory that isn't a git repo), set `target_system.live_target: true`. 
The executor then runs directly in `repo_path` with no per-iteration `git worktree`, and the planner is told up front that arms must be probes — see [docs/quickstart.md#live-target-campaigns-live_target-true](docs/quickstart.md#live-target-campaigns-live_target-true) for details. + ### 5. Run a campaign ```bash diff --git a/docs/quickstart.md b/docs/quickstart.md index 69e0fd2..aa9e91e 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -125,6 +125,43 @@ After a campaign, your working directory contains: - **`runs/iter-N/inputs/`** — Agent-created input files (configs, workloads) - **`runs/iter-N/results/`** — Experiment output files +## Live-target campaigns (`live_target: true`) + +By default Nous treats `repo_path` as a git repo and creates a fresh `git worktree` per iteration so that any source-code patches are isolated. For some campaigns there is no codebase to evolve — the thing you want to study is a *running* system: a Kubernetes cluster, a deployed service, a dataset on disk, a non-git scratch directory. Setting `live_target: true` tells Nous to skip worktree creation and run the executor directly inside `repo_path`. + +Use it when: + +- The target is a live system you are probing, not a codebase you are mutating (e.g. a GPU cluster, a production-like service, a workload generator). +- `repo_path` points at a directory that is not a git repo, or is a git repo whose working tree must not be branched. +- The bundle should only contain probe-style arms (config tweaks, command-line invocations, observation runs) — never `code_changes`. + +Example: + +```yaml +research_question: > + Why does p99 latency spike when the cluster autoscaler kicks in? + +target_system: + name: "Staging GPU cluster" + description: > + Live Kubernetes cluster running our inference workload. + The agent probes the cluster via kubectl and Prometheus; it does + not modify source code. 
+ repo_path: /scratch/cluster-probe # any working directory; need not be a git repo + live_target: true + +prompts: + methodology_layer: "prompts/methodology" + domain_adapter_layer: null +``` + +How `live_target` differs from regular observe-mode arms: + +- **Observe mode** is a *bundle-level* property — an individual arm has no `code_changes`, so the executor skips patching and just runs commands. The campaign can still mix observe arms and evolve arms in the same bundle, and a worktree is still created. +- **`live_target: true`** is a *campaign-level* property — it controls the *executor environment* (no worktree, run in `repo_path` directly) and tells the planner up front that the target is a shared running system, so every arm must be a probe. Bundles with `code_changes` arms are incoherent in this mode. + +Pick `live_target: true` when there is nothing meaningful to branch from; pick observe-mode arms when you have a real codebase but a particular iteration only needs to measure, not patch. + ## Choosing a model Defaults (from `defaults.yaml`):