Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -245,8 +245,10 @@ when ALL of these hold**:
4. The campaign's apparatus checks are robust to design-agent
variation, and validate ATTRIBUTION (not just upstream totals,
#252 / F7).
5. A stale ``principles.json`` ledger is acceptable. Auto-approve
never gates on it.

(Note: ``principles.json`` staleness is NOT a precondition.
Auto-approve never gates on it, so the ledger's freshness doesn't
affect whether the gate would have caught a deviation.)

**If any of these fail**, either run interactively (no
``--auto-approve``) so a human reviewer sees the design at the gate,
Expand Down
2 changes: 1 addition & 1 deletion docs/friction-245-resolution.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ implementation in one hop.
| F9 | [#254](https://github.com/AI-native-Systems-Research/agentic-strategy-evolution/issues/254) | LOW | `nous clean --orphaned` subcommand in `cli.py` (`_cmd_clean`); supports `--target-repo`, `--campaign`, `--dry-run` | (CLI smoke; mirrors `gc_orphan_worktrees`) |
| F10 | [#255](https://github.com/AI-native-Systems-Research/agentic-strategy-evolution/issues/255) | MED | New section "`--auto-approve` safety preconditions" in `README.md`; `--auto-approve` help text references it | (docs only) |
| F11 | [#256](https://github.com/AI-native-Systems-Research/agentic-strategy-evolution/issues/256) | MED | `_emit_high_build_warning` in `iteration.py` runs after DESIGN; emits a sized recommendation to raise `max_turns.execute_analyze` | `tests/test_friction_245.py::test_f11_*` |
| F12 | [#257](https://github.com/AI-native-Systems-Research/agentic-strategy-evolution/issues/257) | LOW | `aiter_with_silence_watchdog`'s `aclose` path now wraps in `asyncio.wait_for(timeout=5)` and explicitly catches `(TimeoutError, CancelledError, RuntimeError, GeneratorExit)` | (covered by existing watchdog tests; race is non-deterministic) |
| F12 | [#257](https://github.com/AI-native-Systems-Research/agentic-strategy-evolution/issues/257) | LOW | `aiter_with_silence_watchdog`'s `aclose` path now wraps in `asyncio.wait_for(timeout=5)` and explicitly catches `(TimeoutError, CancelledError, RuntimeError, GeneratorExit)`; broad fallback now logs at WARNING instead of swallowing silently | `tests/test_friction_245.py::test_f12_*` |
| F13 | [#258](https://github.com/AI-native-Systems-Research/agentic-strategy-evolution/issues/258) | HIGH | `nous create-campaign` scaffold gains a commented `locked_parameters` block + `locked_workload`, `derived_from`, `sdk_timeouts.turn_silence_threshold_seconds` (per-phase), `plot_specs`. New `docs/campaign-authoring-guide.md` includes the "what to lock" inventory | (existing scaffold tests cover schema-validity) |
| F14 | [#259](https://github.com/AI-native-Systems-Research/agentic-strategy-evolution/issues/259) | N/A | `docs/campaign-authoring-guide.md` includes "Rehearsal as scientific instrument" section + "Pre-lock unit check" | (docs only) |
| F15 | [#260](https://github.com/AI-native-Systems-Research/agentic-strategy-evolution/issues/260) | HIGH | `bundle.experiment_spec.physical_realism_check` schema + `_validate_physical_realism` soft-warn when `k_realism_ratio < 0.5` and justification is empty/perfunctory | `tests/test_friction_245.py::test_f15_*` |
Expand Down
4 changes: 3 additions & 1 deletion orchestrator/campaign.py
Original file line number Diff line number Diff line change
Expand Up @@ -587,7 +587,9 @@ def main() -> None:

run_id = args.run_id or campaign.get("run_id") or campaign_path.parent.name + "-run"
repo_path = campaign.get("target_system", {}).get("repo_path")
work_dir = setup_work_dir(run_id, repo_path=repo_path)
work_dir = setup_work_dir(
run_id, repo_path=repo_path, campaign_path=str(campaign_path),
)
print(f"Working directory: {work_dir.resolve()}")
print(f"Max iterations: {max_iter}")

Expand Down
53 changes: 36 additions & 17 deletions orchestrator/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,9 @@ def _cmd_run(args):
file=sys.stderr,
)

work_dir = setup_work_dir(run_id, repo_path=repo_path)
work_dir = setup_work_dir(
run_id, repo_path=repo_path, campaign_path=str(campaign_path),
)

max_iterations = args.max_iterations if args.max_iterations is not None else campaign.get("max_iterations", 10)
# #188: --bundle / --problem-md / --handoff-md only apply to iter-1.
Expand Down Expand Up @@ -303,8 +305,10 @@ def _cmd_resume(args):
f"Got: {args.target}\n"
f"This appears to be a work_dir. Use ``nous status "
f"{args.target}`` to inspect the work_dir; ``nous resume`` "
f"needs the campaign yaml so it can re-validate the spec "
f"and re-emit reproducibility metadata (#253 / F8)."
f"needs the campaign yaml so it can re-validate the spec. "
f"(reproducibility_metadata captured at the original INIT "
f"is preserved — first-capture-wins, #262 / F17.) "
f"(#253 / F8)"
f"{hint}",
file=sys.stderr,
)
Expand Down Expand Up @@ -934,8 +938,17 @@ def _cmd_clean(args):


def _cmd_package(args):
"""#263 (F18): tarball work_dir + reproduce.sh + Dockerfile + README."""
"""#263 (F18): tarball work_dir + reproduce.sh + Dockerfile + README.

Staging artifacts (reproduce.sh, Dockerfile, PACKAGE_README.md) are
written to a temp directory and added to the tarball at the
``<run_id>/`` prefix — they never touch the work_dir on disk.
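Successive runs of ``nous package`` therefore bundle the same set of
files without accumulating staging files in the campaign's persistent
state. (The archives need not be byte-identical: the staged files are
rewritten, with fresh timestamps, on every run.)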
"""
import tarfile
import tempfile
import textwrap

work_dir = resolve_work_dir(args.target)
Expand Down Expand Up @@ -1009,15 +1022,20 @@ def _cmd_package(args):
```
""")

# Stage these alongside the work_dir for tar inclusion.
pkg_root = work_dir
(pkg_root / "reproduce.sh").write_text(reproduce_sh)
(pkg_root / "reproduce.sh").chmod(0o755)
(pkg_root / "Dockerfile").write_text(dockerfile)
(pkg_root / "PACKAGE_README.md").write_text(readme)

with tarfile.open(output, "w:gz") as tar:
tar.add(work_dir, arcname=work_dir.name)
# Stage to a temp directory (gone after this command), tar both
# work_dir AND the staged files at <run_id>/ prefix. The work_dir
# on disk is unchanged — this command is read-only with respect
# to the campaign's persistent state.
with tempfile.TemporaryDirectory() as tmp_root:
tmp = Path(tmp_root)
(tmp / "reproduce.sh").write_text(reproduce_sh)
(tmp / "reproduce.sh").chmod(0o755)
(tmp / "Dockerfile").write_text(dockerfile)
(tmp / "PACKAGE_README.md").write_text(readme)
with tarfile.open(output, "w:gz") as tar:
tar.add(work_dir, arcname=work_dir.name)
for staged in ("reproduce.sh", "Dockerfile", "PACKAGE_README.md"):
tar.add(tmp / staged, arcname=f"{work_dir.name}/{staged}")
print(f"Wrote {output}")


Expand Down Expand Up @@ -1184,10 +1202,11 @@ def main():
p_stop.add_argument(
"--immediate", action="store_true",
help="Event-boundary halt (#250 / F5). Writes a STOP_IMMEDIATE "
"sentinel that the SDK turn loop checks at each tool-call "
"return — aborts within seconds rather than at the next "
"phase boundary. Use when EXECUTE_ANALYZE is building "
"wrong code and you want to halt promptly.",
"sentinel that the SDK turn loop checks at each event "
"boundary (every SDK message) — aborts within seconds "
"rather than at the next phase boundary. Use when "
"EXECUTE_ANALYZE is building wrong code and you want "
"to halt promptly.",
)
p_stop.set_defaults(func=_cmd_stop)

Expand Down
139 changes: 100 additions & 39 deletions orchestrator/iteration.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,17 @@

from orchestrator.engine import Engine
from orchestrator.gates import HumanGate
from orchestrator.lineage import (
apply_derived_from_patch,
emit_cumulative_patch,
resolve_derived_from,
)
from orchestrator.llm_dispatch import LLMDispatcher
from orchestrator.plot_specs import invoke_plot_specs
from orchestrator.reproducibility import (
capture_reproducibility_metadata,
snapshot_iter_files,
)
from orchestrator.util import atomic_write

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -724,7 +734,32 @@ def _merge_principles(work_dir: Path, iter_dir: Path) -> None:
atomic_write(principles_path, json.dumps(store, indent=2) + "\n")


def setup_work_dir(run_id: str, repo_path: str | None = None) -> Path:
def _campaign_yaml_dir_from_state(work_dir: Path) -> Path | None:
"""#263 (F18): resolve the campaign.yaml's directory from state.json.

Plot scripts declared in ``campaign.plot_specs[].script`` are
relative to the campaign.yaml's directory. ``config_ref`` is
recorded in state.json at ``setup_work_dir`` time; this helper
is the single read site so a legacy campaign without
``config_ref`` returns ``None`` rather than guessing.
"""
state_path = work_dir / "state.json"
if not state_path.exists():
return None
try:
state = json.loads(state_path.read_text())
except (OSError, json.JSONDecodeError):
return None
config_ref = state.get("config_ref") if isinstance(state, dict) else None
if not config_ref:
return None
return Path(config_ref).parent


def setup_work_dir(
run_id: str, repo_path: str | None = None,
campaign_path: str | None = None,
) -> Path:
"""Create and initialize a working directory from templates.

See ``orchestrator/work_dir_resolver.py`` for the canonical
Expand Down Expand Up @@ -817,13 +852,20 @@ def setup_work_dir(run_id: str, repo_path: str | None = None) -> Path:
# detection and future cross-machine discovery.
state["work_dir"] = str(work_dir.resolve())
state["repo_path"] = str(Path(repo_path).resolve()) if repo_path else None
# #263 (F18): record the campaign.yaml's absolute path so
# plot_specs scripts (declared relative to that file) can be
# resolved at REPORT/finalize time. Only set when provided —
# don't clobber a value already recorded by a prior setup.
if campaign_path is not None:
state["config_ref"] = str(Path(campaign_path).resolve())
elif "config_ref" not in state:
state["config_ref"] = None
# #262 (F17): auto-capture reproducibility metadata at INIT (before
# any DESIGN turn fires). First capture wins — re-running INIT on
# an existing campaign preserves the original commit/dirty/sha
# values, which is what reviewers want (the state at campaign
# start, not at iter-3 resume time).
if "reproducibility_metadata" not in state:
from orchestrator.reproducibility import capture_reproducibility_metadata
state["reproducibility_metadata"] = capture_reproducibility_metadata(
Path(repo_path) if repo_path else None
)
Expand All @@ -850,10 +892,12 @@ def setup_work_dir(run_id: str, repo_path: str | None = None) -> Path:
def _emit_high_build_warning(bundle_path: Path, max_turns_execute_analyze: int) -> None:
"""#256 (F11): warn when bundle.code_changes implies a high BUILD count.

Threshold heuristic: ``len(arms-with-code_changes) >= 5`` OR
``total_files >= 5``. Below the threshold, no warning. At/above,
Threshold: ``total_files >= 5`` (sum of code_changes entries
across all arms with a non-empty list). Below 5, silent. At/above,
print a recommendation that the operator raise
``campaign.max_turns.execute_analyze`` to ~``120 + 30 * total_files``.
``campaign.max_turns.execute_analyze`` to ~``120 + 30 * total_files``,
suppressed when the operator already set a value at or above
that target.

Pure print — never raises — so a misshapen bundle that the schema
validator already rejected doesn't fail this advisory pass.
Expand Down Expand Up @@ -1090,13 +1134,14 @@ def _max_turns_for(phase_key: str) -> int:
# #262 (F17): snapshot latency/hardware config files into
# runs/iter-N/snapshots/ so a future reviewer can diff the exact
# numbers each iter ran with — even if the operator later edits the
# source-of-truth file in the target repo. Best-effort; missing
# files are skipped (the candidate list is target-agnostic).
if repo_path and not (iter_dir / "snapshots").exists():
# source-of-truth file in the target repo. Idempotency lives in
# snapshot_iter_files itself (mkdir exist_ok + content overwrite),
# so we don't gate on the directory's existence — that would skip
# re-snapshotting on resume after a manual touch.
if repo_path:
try:
from orchestrator.reproducibility import snapshot_iter_files
snapshot_iter_files(Path(repo_path), iter_dir)
except (OSError, ImportError) as exc:
except OSError as exc:
logger.warning("repro snapshot for iter-%d skipped: %s", iteration, exc)

if engine.phase == "DONE":
Expand Down Expand Up @@ -1268,23 +1313,17 @@ def _max_turns_for(phase_key: str) -> int:
# preflight if the campaign declares one. Failure to apply
# is surfaced loudly (the user must rebase the prior
# campaign or update derived_from.iteration); we do NOT
# silently proceed.
try:
from orchestrator.lineage import (
apply_derived_from_patch,
resolve_derived_from,
)
derived_patch = resolve_derived_from(
campaign, repo_path=Path(repo_path),
)
if derived_patch is not None:
ok, msg = apply_derived_from_patch(experiment_dir, derived_patch)
if ok:
print(f" derived_from: {msg}")
else:
raise RuntimeError(msg)
except ImportError:
pass
# silently proceed. Imports are at module top — a broken
# orchestrator.lineage is a self-inflicted bug, not an
# optional dependency.
derived_patch = resolve_derived_from(
campaign, repo_path=Path(repo_path),
)
if derived_patch is not None:
ok, msg = apply_derived_from_patch(experiment_dir, derived_patch)
if not ok:
raise RuntimeError(msg)
print(f" derived_from: {msg}")
if cli_dispatcher:
import contextlib
ctx = cli_dispatcher.override_cwd(experiment_dir) if experiment_dir else contextlib.nullcontext()
Expand Down Expand Up @@ -1378,15 +1417,23 @@ def _max_turns_for(phase_key: str) -> int:
# the experiment worktree branch. The cumulative form is what
# future ``derived_from`` campaigns reuse; the per-arm patches
# are incremental on the branch state and don't apply to a
# fresh main checkout.
# fresh main checkout. emit_cumulative_patch is best-effort
# internally (returns None on git failure) and writes a
# cumulative.patch.error sidecar so a later operator inspecting
# ``nous lineage`` can see why inheritance broke.
if repo_path and experiment_id:
try:
from orchestrator.lineage import emit_cumulative_patch
emit_cumulative_patch(
Path(repo_path), f"nous-exp-{experiment_id}", iter_dir,
cumulative_path = emit_cumulative_patch(
Path(repo_path), f"nous-exp-{experiment_id}", iter_dir,
)
if cumulative_path is None:
# I1: surface the failure to the user-facing console too,
# not just orchestrator.log — `derived_from` campaigns
# depend on this artifact.
print(
f" ⚠ cumulative.patch emit failed for iter-{iteration} — "
f"see {iter_dir / 'patches' / 'cumulative.patch.error'} "
f"if you plan to derive a future campaign from this run."
)
except (ImportError, OSError) as exc:
logger.warning("cumulative patch emit skipped: %s", exc)
# Clean up worktree only on success
if repo_path and experiment_id:
remove_experiment_worktree(Path(repo_path), experiment_id)
Expand Down Expand Up @@ -1433,15 +1480,27 @@ def _max_turns_for(phase_key: str) -> int:
iteration=iteration, campaign=campaign,
)
# #263 (F18): invoke plot_specs scripts after findings.json exists.
# Best-effort — plot failures never block the iteration.
# Best-effort — plot failures never block the iteration. The
# campaign_yaml_dir is read from state.json (recorded at INIT)
# so script paths declared in campaign.plot_specs[].script
# resolve relative to the campaign.yaml's directory, not
# work_dir's parent.
if campaign.get("plot_specs"):
try:
from orchestrator.plot_specs import invoke_plot_specs
results = invoke_plot_specs(campaign, iter_dir)
campaign_yaml_dir = _campaign_yaml_dir_from_state(work_dir)
results = invoke_plot_specs(
campaign, iter_dir, campaign_yaml_dir=campaign_yaml_dir,
)
ok = sum(1 for r in results if r.get("ok"))
print(f" plot_specs: {ok}/{len(results)} succeeded → "
f"{iter_dir / 'figures'}")
except (ImportError, OSError) as exc:
# Persist the per-spec result rows so failures are
# inspectable post-hoc, not just in the orchestrator log.
atomic_write(
iter_dir / "figures" / "plot_specs_results.json",
json.dumps(results, indent=2) + "\n",
)
except OSError as exc:
logger.warning("plot_specs invocation skipped: %s", exc)
print(f" -> Principles merged into {work_dir / 'principles.json'}")
print(f" -> best_found.json updated at {work_dir / 'best_found.json'}")
Expand Down Expand Up @@ -1509,7 +1568,9 @@ def main() -> None:

run_id = args.run_id or campaign.get("run_id") or campaign_path.parent.name + "-run"
repo_path = campaign.get("target_system", {}).get("repo_path")
work_dir = setup_work_dir(run_id, repo_path=repo_path)
work_dir = setup_work_dir(
run_id, repo_path=repo_path, campaign_path=str(campaign_path),
)
print(f"Working directory: {work_dir.resolve()}")

run_iteration(
Expand Down
Loading
Loading