Skip to content

Commit ed4f59f

Browse files
committed
Ablation experiment runner, convergence plots, edge case hardening
- run_experiment.py: orchestrates 4 ablations × 3 benchmarks systematically
- analysis/plot_convergence.py: per-cycle and best-so-far charts from log data
- mutation_engine: benchmark weights updated to new names; empty/short prompt guards added
- population_manager: select_tournament returns None on an empty population (no crash)
- infinite_research_loop: benchmark_name param for fixed-benchmark runs; ablation config embedded in log entries for filtering; redundant load removed
1 parent fbaf864 commit ed4f59f

7 files changed

Lines changed: 598 additions & 52 deletions

File tree

analysis/plot_convergence.py

Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
#!/usr/bin/env python3
2+
"""Plot grounded evolution convergence curves from experiment data.
3+
4+
Usage:
5+
python analysis/plot_convergence.py # Use main run_log.jsonl
6+
python analysis/plot_convergence.py --ablation # Use per-condition files
7+
python analysis/plot_convergence.py --rolling=5 # Rolling-average window for the main-log chart
8+
9+
Output: PNG files in analysis/charts/
10+
"""
11+
12+
import json
13+
import sys
14+
from collections import defaultdict
15+
from pathlib import Path
16+
from typing import Any
17+
18+
19+
CHARTS_DIR: Path = Path("analysis/charts")
20+
ROLLING_WINDOW: int = 10 # default rolling average window
21+
22+
23+
def load_main_log() -> list[dict[str, Any]]:
    """Load all cycles from the main experiment log.

    Reads ``experiments/run_log.jsonl`` (relative to the current working
    directory) and decodes one JSON document per line. Blank and
    whitespace-only lines are skipped.

    Returns:
        The decoded records, in file order.

    Raises:
        SystemExit: with status 1 when the log file does not exist.
        json.JSONDecodeError: when a non-blank line is not valid JSON.
    """
    log_path: Path = Path("experiments/run_log.jsonl")
    if not log_path.exists():
        print("No experiment log found at experiments/run_log.jsonl")
        sys.exit(1)
    # Filter on the stripped line: a line holding only spaces/tabs is truthy
    # and would otherwise reach json.loads and abort the whole load.
    return [
        json.loads(line)
        for line in log_path.read_text(encoding="utf-8").splitlines()
        if line.strip()
    ]
30+
31+
32+
def load_ablation_runs() -> dict[str, list[dict[str, Any]]]:
    """Load per-condition results from experiments/ablation_runs/*.jsonl.

    Each ``*.jsonl`` file directly inside ``experiments/ablation_runs``
    (relative to the current working directory) becomes one entry keyed by
    the file name without its extension. Files are visited in sorted path
    order. Blank and whitespace-only lines are skipped.

    Returns:
        Mapping of file stem to the records decoded from that file, in file
        order. The mapping is empty when the directory has no ``.jsonl`` files.

    Raises:
        SystemExit: with status 1 when the directory does not exist.
        json.JSONDecodeError: when a non-blank line is not valid JSON.
    """
    runs_dir: Path = Path("experiments/ablation_runs")
    if not runs_dir.exists():
        print("No ablation runs found at experiments/ablation_runs/")
        sys.exit(1)

    results: dict[str, list[dict[str, Any]]] = {}
    for fpath in sorted(runs_dir.glob("*.jsonl")):
        # Filter on the stripped line so a whitespace-only line cannot reach
        # json.loads and abort loading of every condition.
        results[fpath.stem] = [
            json.loads(line)
            for line in fpath.read_text(encoding="utf-8").splitlines()
            if line.strip()
        ]
    return results
46+
47+
48+
def rolling_average(values: list[float], window: int) -> list[float]:
    """Return the trailing moving average of ``values``.

    Each output element is the mean of the value at that position and up to
    ``window - 1`` values before it, so the first elements average over a
    shorter span. An empty input, or a window of 1 or less, yields an
    unsmoothed copy of the input.
    """
    if window <= 1 or not values:
        return list(values)
    # ``end`` is the exclusive upper bound of each trailing span.
    spans = (values[max(0, end - window):end] for end in range(1, len(values) + 1))
    return [sum(span) / len(span) for span in spans]
58+
59+
60+
def plot_main_convergence(records: list[dict[str, Any]]) -> None:
    """Plot per-cycle scores and best-so-far convergence from the main log.

    Draws two stacked charts sharing the x axis (one point per record, in the
    order given): the raw per-cycle score with its rolling average on top, and
    the running maximum below. The figure is written to
    ``CHARTS_DIR / "convergence_main.png"``; the directory is created if
    needed. If matplotlib cannot be imported, a hint is printed and nothing
    is written.

    Args:
        records: Log entries; each entry's ``"score"`` is plotted, with a
            missing key treated as 0. The smoothing window is read from the
            module-level ``ROLLING_WINDOW``.
    """
    try:
        import matplotlib
        # Select the non-interactive Agg backend before pyplot is imported;
        # the chart is only ever saved to a file.
        matplotlib.use("Agg")
        import matplotlib.pyplot as plt
    except ImportError:
        print("matplotlib not installed. Install it with: pip install matplotlib")
        return

    CHARTS_DIR.mkdir(parents=True, exist_ok=True)

    scores: list[float] = [r.get("score", 0) for r in records]
    # Running maximum seeded from the first score rather than a fixed 0, so a
    # series of negative scores is not reported as a best of 0.
    best: list[float] = []
    for s in scores:
        best.append(s if not best else max(best[-1], s))

    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10), sharex=True)

    # Top: per-cycle score
    ax1.plot(scores, alpha=0.4, color="blue", linewidth=0.8, label="Per-cycle score")
    smoothed = rolling_average(scores, ROLLING_WINDOW)
    ax1.plot(smoothed, color="blue", linewidth=2, label=f"Rolling avg (w={ROLLING_WINDOW})")
    ax1.set_ylabel("Execution Score")
    ax1.set_title("Grounded Evolution: Per-Cycle Scores")
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # Bottom: best-so-far
    ax2.plot(best, color="green", linewidth=2, label="Best so far")
    ax2.set_xlabel("Cycle")
    ax2.set_ylabel("Best Score")
    ax2.set_title("Grounded Evolution: Best Score Convergence")
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    fig.tight_layout()
    out: Path = CHARTS_DIR / "convergence_main.png"
    fig.savefig(out, dpi=150)
    plt.close(fig)
    print(f"Saved {out}")
105+
106+
107+
def _running_best(scores: list[float]) -> list[float]:
    """Return the running maximum of ``scores`` (empty input gives an empty list)."""
    best: list[float] = []
    for s in scores:
        # Seed from the first score instead of 0 so negative scores are kept.
        best.append(s if not best else max(best[-1], s))
    return best


def _base_condition(run_id: str, known: tuple[str, ...]) -> str:
    """Map a run id (file stem) to the condition it belongs to.

    A run id equal to a known condition, or starting with one followed by
    ``_``, maps to that condition. Anything else falls back to dropping the
    text after the last underscore.
    """
    # Longest name first so one known condition cannot shadow a longer one.
    for name in sorted(known, key=len, reverse=True):
        if run_id == name or run_id.startswith(name + "_"):
            return name
    return run_id.rsplit("_", 1)[0] if "_" in run_id else run_id


def plot_ablation_convergence(conditions: dict[str, list[dict[str, Any]]]) -> None:
    """Plot ablation study comparison: one line per condition.

    The top chart draws the best-so-far score of every run, labelled with the
    run id. The bottom chart groups runs by base condition, averages their
    scores cycle by cycle (truncated to the shortest run in the group) and
    draws the running maximum of that average. The figure is written to
    ``CHARTS_DIR / "convergence_ablation.png"``; the directory is created if
    needed. If matplotlib cannot be imported, a hint is printed and nothing
    is written.

    Args:
        conditions: Mapping of run id to its log entries; each entry's
            ``"score"`` is used, with a missing key treated as 0. Run ids are
            presumably ``<condition>_<benchmark>`` file stems — the writer is
            not visible here, so confirm against the experiment runner.
    """
    try:
        import matplotlib
        # Select the non-interactive Agg backend before pyplot is imported;
        # the chart is only ever saved to a file.
        matplotlib.use("Agg")
        import matplotlib.pyplot as plt
    except ImportError:
        print("matplotlib not installed. Install it with: pip install matplotlib")
        return

    CHARTS_DIR.mkdir(parents=True, exist_ok=True)

    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 12))

    colors: dict[str, str] = {
        "full": "blue",
        "mutation_only": "orange",
        "crossover_only": "green",
        "random_walk": "red",
    }
    markers: dict[str, str] = {
        "full": "o",
        "mutation_only": "s",
        "crossover_only": "^",
        "random_walk": "v",
    }
    known: tuple[str, ...] = tuple(colors)

    # Top: per-condition best-so-far. The raw scores are grouped by base
    # condition in the same pass for the aggregate chart below.
    condition_scores: dict[str, list[list[float]]] = defaultdict(list)
    for cid, records in sorted(conditions.items()):
        scores: list[float] = [r.get("score", 0) for r in records]
        best: list[float] = _running_best(scores)
        base_cid: str = _base_condition(cid, known)
        condition_scores[base_cid].append(scores)
        ax1.plot(
            best,
            color=colors.get(base_cid, "gray"),
            linewidth=1.5,
            label=cid,
            marker=markers.get(base_cid, "."),
            markevery=max(1, len(best) // 10),
        )

    ax1.set_ylabel("Best Score")
    ax1.set_title("Ablation Study: Best Score Convergence by Condition")
    ax1.legend(fontsize=8, ncol=2)
    ax1.grid(True, alpha=0.3)

    # Bottom: aggregated per-condition (group by condition, average across benchmarks)
    for cond, all_scores in sorted(condition_scores.items()):
        # Average across benchmarks at each cycle; only cycles present in
        # every run of the group can be averaged.
        min_len: int = min(len(s) for s in all_scores)
        avg_scores: list[float] = [
            sum(s[i] for s in all_scores) / len(all_scores) for i in range(min_len)
        ]
        best_avg: list[float] = _running_best(avg_scores)
        ax2.plot(
            best_avg,
            color=colors.get(cond, "gray"),
            linewidth=2.5,
            label=cond,
            marker=markers.get(cond, "."),
            markevery=max(1, min_len // 8),
        )

    ax2.set_xlabel("Cycle")
    ax2.set_ylabel("Best Score (avg across benchmarks)")
    ax2.set_title("Ablation Study: Aggregate Convergence (averaged across benchmarks)")
    ax2.legend(fontsize=10)
    ax2.grid(True, alpha=0.3)

    fig.tight_layout()
    out: Path = CHARTS_DIR / "convergence_ablation.png"
    fig.savefig(out, dpi=150)
    plt.close(fig)
    print(f"Saved {out}")
187+
188+
189+
def main() -> None:
    """Parse command-line flags, load the requested data and draw its chart.

    Flags (read from ``sys.argv``):
        --ablation      Plot the per-condition files instead of the main log.
        --rolling=N     Rolling-average window; ``--rolling N`` is accepted
                        as well. The value is stored in the module-level
                        ``ROLLING_WINDOW``, which the main-log chart reads.

    Raises:
        SystemExit: with status 2 when the ``--rolling`` value is missing or
            is not an integer.
    """
    # The global declaration has to come before any use of the name in this
    # function; reading ROLLING_WINDOW first makes the module fail to compile.
    global ROLLING_WINDOW

    args: list[str] = sys.argv[1:]
    use_ablation: bool = "--ablation" in args

    rolling_window: int = ROLLING_WINDOW
    for idx, arg in enumerate(args):
        if arg.startswith("--rolling="):
            raw: str = arg.split("=", 1)[1]
        elif arg == "--rolling":
            # Space-separated form: the value is the next argument, if any.
            raw = args[idx + 1] if idx + 1 < len(args) else ""
        else:
            continue
        try:
            rolling_window = int(raw)
        except ValueError:
            print(f"Invalid --rolling value: {raw!r} (expected an integer)")
            sys.exit(2)
    ROLLING_WINDOW = rolling_window

    if use_ablation:
        conditions = load_ablation_runs()
        print(f"Loaded {len(conditions)} condition files from experiments/ablation_runs/")
        print(f"Conditions: {', '.join(sorted(conditions.keys()))}")
        plot_ablation_convergence(conditions)
    else:
        records = load_main_log()
        n_benchmarks = len({r.get("benchmark", "?") for r in records})
        print(f"Loaded {len(records)} cycles across {n_benchmarks} benchmarks")
        plot_main_convergence(records)

    print(f"Charts saved to {CHARTS_DIR}/")


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

evaluator/runtime_evaluator.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -234,8 +234,19 @@ def evaluate_project(project_dir: str, benchmark: Benchmark | None = None, timeo
234234

235235
has_test_files: bool = len(list(Path(project_dir).rglob("test_*.py"))) > 0
236236
metrics["has_tests"] = has_test_files
237-
if has_test_files:
238-
score += 3.0
237+
test_quality: float = 0.0
238+
for tf in Path(project_dir).rglob("test_*.py"):
239+
try:
240+
content: str = tf.read_text()
241+
assertion_count: int = content.count("assert ")
242+
placeholder_count: int = content.count("test_placeholder")
243+
real_assertions: int = max(0, assertion_count - placeholder_count)
244+
test_quality += min(real_assertions, 10)
245+
except Exception:
246+
pass
247+
metrics["test_quality"] = round(test_quality / max(1, len(list(Path(project_dir).rglob("test_*.py")))), 1) if list(Path(project_dir).rglob("test_*.py")) else 0.0
248+
if test_quality > 0:
249+
score += min(test_quality, 10.0)
239250

240251
has_readme: bool = (Path(project_dir) / "README.md").exists()
241252
metrics["has_readme"] = has_readme

generator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ def generate_code(prompt: str, model: str | None = None, temperature: float | No
5454
messages=[
5555
{
5656
"role": "system",
57-
"content": "You are an autonomous software architect. Generate clean executable Python projects. Output each file in a markdown code block with the filename as the language tag (e.g. ```main.py). Include a README.md and requirements.txt.",
57+
"content": "You are an autonomous software architect. Generate clean executable Python projects. CRITICAL: Output each file in a markdown code block with the filename as the language tag (e.g. ```main.py). Include a README.md and requirements.txt. You MUST generate real test files (test_*.py) with real assertions — NO placeholder tests like `def test_placeholder(): assert True`. Every test must call the actual functions being tested with real inputs and verify their behavior with `assert`. Also include type hints on all function signatures and docstrings on all public functions and classes.",
5858
},
5959
{"role": "user", "content": prompt},
6060
],

infinite_research_loop.py

Lines changed: 30 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,13 @@
2121
from pathlib import Path
2222
from typing import Any
2323

24+
from population_manager import PopulationEntry
25+
2426
sys.stdout.reconfigure(line_buffering=True)
2527
sys.stderr.reconfigure(line_buffering=True)
2628

2729
from generator import generate_code, write_project_files
28-
from mutation_engine import mutate_prompt, crossover_prompts
30+
from mutation_engine import mutate_prompt, crossover_prompts, record_mutation_outcome
2931
from population_manager import (
3032
load_population,
3133
save_population,
@@ -47,7 +49,7 @@
4749
# === Ablation experiment configuration ===
4850
# Set these before running to control which evolution operators are active.
4951
# Each ablation isolates one variable to measure its contribution.
50-
ABLATION: dict[str, bool] = {
52+
ABLATION: dict[str, Any] = {
5153
"mutation": True, # mutate_prompt on selected parent
5254
"crossover": True, # crossover_prompts on two parents
5355
"mutation_rate": 0.7, # probability of mutation when both are enabled
@@ -120,36 +122,48 @@ def append_experiment_log(entry: dict[str, Any]) -> None:
120122
f.write(json.dumps(entry) + "\n")
121123

122124

123-
def evolve_cycle(cycle_num: int, generation: int, ablation_override: dict[str, bool] | None = None) -> float:
125+
def evolve_cycle(
126+
cycle_num: int,
127+
generation: int,
128+
ablation_override: dict[str, bool] | None = None,
129+
benchmark_name: str | None = None,
130+
) -> float:
124131
"""Run one evolution cycle: select, mutate, generate, validate, persist.
125132
126-
ablation_override can be passed to run a specific ablation for this cycle.
133+
ablation_override: overrides the global ABLATION dict for this cycle.
134+
benchmark_name: if set, use this specific benchmark instead of random selection.
127135
Falls back to the global ABLATION dict.
128136
"""
129-
config: dict[str, bool] = ablation_override if ablation_override is not None else ABLATION
137+
config: dict[str, Any] = ablation_override if ablation_override is not None else dict(ABLATION)
130138
population = load_population()
131139
benchmarks: list[Benchmark] = load_benchmarks()
132140

133141
if not population:
134-
population = load_population()
142+
return 0.0
135143

136144
best = select_best(population, k=1)
137145
parent = best[0] if best else population[0]
138-
second = select_tournament(population) if len(population) >= 2 else None
146+
parent_score: float = float(parent.get("score", 0))
147+
second: PopulationEntry | None = select_tournament(population) if len(population) >= 2 else None
139148

140149
mutated_prompt: str = str(parent["prompt"])
141150
applied_mutation: str = "none"
142151
applied_crossover: str | None = None
152+
mutation_desc: str = ""
143153

144154
if config.get("crossover") and second and random.random() > config.get("mutation_rate", 0.7):
145155
mutated_prompt = crossover_prompts(str(parent["prompt"]), str(second["prompt"]))
146156
applied_mutation = "crossover"
147157
applied_crossover = str(second["prompt"])[:80]
148158
elif config.get("mutation"):
149-
mutated_prompt = mutate_prompt(str(parent["prompt"]))
159+
mutated_prompt, mutation_desc = mutate_prompt(str(parent["prompt"]))
150160
applied_mutation = "mutation"
151161

152-
benchmark: Benchmark = random.choice(benchmarks) if random.random() < 0.7 else benchmarks[0]
162+
if benchmark_name:
163+
benchmark_candidates: list[Benchmark] = [b for b in benchmarks if b.get("name") == benchmark_name]
164+
benchmark = benchmark_candidates[0] if benchmark_candidates else benchmarks[0]
165+
else:
166+
benchmark = random.choice(benchmarks)
153167

154168
cycle_start: float = time.time()
155169
metrics, task_dir, files, usage = run_benchmark(mutated_prompt, benchmark, cycle_num)
@@ -169,6 +183,9 @@ def evolve_cycle(cycle_num: int, generation: int, ablation_override: dict[str, b
169183
"score": total_score,
170184
"mutation": applied_mutation,
171185
"crossover_source": applied_crossover,
186+
"ablation_mutation": config.get("mutation", True),
187+
"ablation_crossover": config.get("crossover", True),
188+
"ablation_mutation_rate": config.get("mutation_rate", 0.7),
172189
"files_generated": len(files),
173190
"syntax_valid": metrics.get("syntax", {}).get("valid", False),
174191
"pytest_pass": metrics.get("pytest", {}).get("success", False),
@@ -210,6 +227,10 @@ def evolve_cycle(cycle_num: int, generation: int, ablation_override: dict[str, b
210227
f"Tokens: {usage.get('total_tokens', 0)} | "
211228
f"Time: {cycle_duration:.1f}s"
212229
)
230+
if mutation_desc:
231+
score_delta: float = total_score - parent_score
232+
record_mutation_outcome(mutation_desc, score_delta)
233+
213234
print(summary)
214235
return total_score
215236

0 commit comments

Comments
 (0)