NullLabTests
diff --git a/analysis/plot_convergence.py b/analysis/plot_convergence.py
Lines changed: 215 additions & 0 deletions
diff --git a/evaluator/runtime_evaluator.py b/evaluator/runtime_evaluator.py
Lines changed: 13 additions & 2 deletions
diff --git a/generator.py b/generator.py
Lines changed: 1 addition & 1 deletion
diff --git a/infinite_research_loop.py b/infinite_research_loop.py
Lines changed: 30 additions & 9 deletions
@@ -0,0 +1,215 @@
+#!/usr/bin/env python3
+"""Plot grounded evolution convergence curves from experiment data.
+
+Usage:
+    python analysis/plot_convergence.py                          # Use main run_log.jsonl
+    python analysis/plot_convergence.py --ablation               # Use per-condition files
+    python analysis/plot_convergence.py --ablation --rolling 5   # Rolling average
+
+Output: PNG files in analysis/charts/
+"""
+
+import json
+import sys
+from collections import defaultdict
+from pathlib import Path
+from typing import Any
+
+
+# Directory the plot functions write PNG charts into (a relative path).
+CHARTS_DIR: Path = Path("analysis/charts")
+# Window read by plot_main_convergence(); main() reassigns it from --rolling.
+ROLLING_WINDOW: int = 10  # default rolling average window
+
+
+def load_main_log() -> list[dict[str, Any]]:
+    """Load all cycles from the main experiment log.
+
+    Reads ``experiments/run_log.jsonl`` (a relative path) and parses every
+    non-blank line as one JSON record, preserving file order.
+
+    Returns:
+        The parsed records, one dict per log line.
+
+    Raises:
+        SystemExit: with status 1 (after printing a message) when the log
+            file does not exist.
+        json.JSONDecodeError: when a non-blank line is not valid JSON.
+    """
+    log_path: Path = Path("experiments/run_log.jsonl")
+    if not log_path.exists():
+        print("No experiment log found at experiments/run_log.jsonl")
+        sys.exit(1)
+    # Test ``line.strip()`` rather than ``line``: a whitespace-only line in
+    # the middle of the file is truthy and would otherwise reach json.loads.
+    # Decode explicitly as UTF-8 so the result does not depend on the locale.
+    return [
+        json.loads(line)
+        for line in log_path.read_text(encoding="utf-8").splitlines()
+        if line.strip()
+    ]
+
+
+def load_ablation_runs() -> dict[str, list[dict[str, Any]]]:
+    """Load per-condition results from experiments/ablation_runs/*.jsonl.
+
+    Each ``*.jsonl`` file in the directory becomes one entry keyed by the
+    file's stem; files are visited in sorted path order and every non-blank
+    line is parsed as one JSON record.
+
+    Returns:
+        Mapping of file stem to the list of records parsed from that file.
+        An empty file yields an empty list.
+
+    Raises:
+        SystemExit: with status 1 (after printing a message) when the
+            directory does not exist.
+        json.JSONDecodeError: when a non-blank line is not valid JSON.
+    """
+    runs_dir: Path = Path("experiments/ablation_runs")
+    if not runs_dir.exists():
+        print("No ablation runs found at experiments/ablation_runs/")
+        sys.exit(1)
+
+    results: dict[str, list[dict[str, Any]]] = {}
+    for fpath in sorted(runs_dir.glob("*.jsonl")):
+        # ``line.strip()`` filters whitespace-only lines, which are truthy
+        # and would otherwise make json.loads raise.
+        results[fpath.stem] = [
+            json.loads(line)
+            for line in fpath.read_text(encoding="utf-8").splitlines()
+            if line.strip()
+        ]
+    return results
+
+
+def rolling_average(values: list[float], window: int) -> list[float]:
+    """Return the trailing moving average of ``values``.
+
+    Element ``i`` of the result is the mean of the up-to-``window`` values
+    ending at index ``i``; the first ``window - 1`` elements therefore
+    average over a shorter prefix.  A window of 1 or less, or an empty
+    input, yields an unmodified copy of ``values``.
+    """
+    if window <= 1 or not values:
+        return list(values)
+
+    def _mean(chunk: list[float]) -> float:
+        return sum(chunk) / len(chunk)
+
+    # ``end`` is the exclusive upper bound of each trailing slice.
+    return [
+        _mean(values[max(0, end - window):end])
+        for end in range(1, len(values) + 1)
+    ]
+
+
+def plot_main_convergence(records: list[dict[str, Any]]) -> None:
+    """Render the main-log convergence chart as ``convergence_main.png``.
+
+    The figure has two stacked panels sharing the x axis: the raw per-cycle
+    scores together with their rolling average (window taken from the
+    module-level ``ROLLING_WINDOW``), and the running best score.  A record
+    without a ``"score"`` key counts as 0.  The PNG is written into
+    ``CHARTS_DIR``, which is created if missing.  When matplotlib cannot be
+    imported a hint is printed and nothing is written.
+    """
+    try:
+        import matplotlib
+        matplotlib.use("Agg")  # non-interactive backend; chosen before pyplot loads
+        import matplotlib.pyplot as plt
+    except ImportError:
+        print("matplotlib not installed. Install it with: pip install matplotlib")
+        return
+
+    CHARTS_DIR.mkdir(parents=True, exist_ok=True)
+
+    scores: list[float] = [record.get("score", 0) for record in records]
+
+    # Running maximum, starting from 0.
+    best: list[float] = []
+    running_best: float = 0
+    for value in scores:
+        if value > running_best:
+            running_best = value
+        best.append(running_best)
+
+    fig, (score_ax, best_ax) = plt.subplots(2, 1, figsize=(12, 10), sharex=True)
+
+    # Upper panel: faint raw series with the smoothed series on top.
+    score_ax.plot(scores, alpha=0.4, color="blue", linewidth=0.8, label="Per-cycle score")
+    score_ax.plot(
+        rolling_average(scores, ROLLING_WINDOW),
+        color="blue",
+        linewidth=2,
+        label=f"Rolling avg (w={ROLLING_WINDOW})",
+    )
+    score_ax.set_title("Grounded Evolution: Per-Cycle Scores")
+    score_ax.set_ylabel("Execution Score")
+    score_ax.legend()
+    score_ax.grid(True, alpha=0.3)
+
+    # Lower panel: best score seen up to each cycle.
+    best_ax.plot(best, color="green", linewidth=2, label="Best so far")
+    best_ax.set_title("Grounded Evolution: Best Score Convergence")
+    best_ax.set_xlabel("Cycle")
+    best_ax.set_ylabel("Best Score")
+    best_ax.legend()
+    best_ax.grid(True, alpha=0.3)
+
+    target: Path = CHARTS_DIR / "convergence_main.png"
+    fig.tight_layout()
+    fig.savefig(target, dpi=150)
+    plt.close(fig)
+    print(f"Saved {target}")
+
+
+def _base_condition(cid: str, known: dict[str, str]) -> str:
+    """Map a run identifier (file stem) to its ablation condition name.
+
+    A stem that equals a known condition, or starts with ``<condition>_``,
+    resolves to that condition; the longest match wins.  Any other stem
+    falls back to dropping the text after its last underscore.
+    """
+    matches = [name for name in known if cid == name or cid.startswith(name + "_")]
+    if matches:
+        return max(matches, key=len)
+    return cid.rsplit("_", 1)[0] if "_" in cid else cid
+
+
+def _best_so_far(scores: list[float]) -> list[float]:
+    """Return the running maximum of ``scores``, starting from 0."""
+    best: list[float] = []
+    running: float = 0
+    for s in scores:
+        running = max(running, s)
+        best.append(running)
+    return best
+
+
+def plot_ablation_convergence(conditions: dict[str, list[dict[str, Any]]]) -> None:
+    """Plot ablation study comparison: one line per condition.
+
+    Writes ``convergence_ablation.png`` into ``CHARTS_DIR`` (created if
+    missing) with two panels:
+
+    * top: the best-so-far score of every run in ``conditions``;
+    * bottom: for each base condition, the best-so-far of the per-cycle
+      average over all runs of that condition, truncated to the shortest run.
+
+    Args:
+        conditions: mapping of run identifier to its list of cycle records.
+            A record without a ``"score"`` key counts as 0.  The identifier
+            is matched against the known condition names (``full``,
+            ``mutation_only``, ``crossover_only``, ``random_walk``) to pick
+            colour, marker and aggregation group; unmatched identifiers are
+            drawn in gray.
+
+    When matplotlib cannot be imported a hint is printed and nothing is
+    written.
+    """
+    try:
+        import matplotlib
+        matplotlib.use("Agg")  # non-interactive backend; chosen before pyplot loads
+        import matplotlib.pyplot as plt
+    except ImportError:
+        print("matplotlib not installed. Install it with: pip install matplotlib")
+        return
+
+    CHARTS_DIR.mkdir(parents=True, exist_ok=True)
+
+    fig, axes = plt.subplots(2, 1, figsize=(14, 12))
+
+    ax1, ax2 = axes
+
+    colors: dict[str, str] = {
+        "full": "blue",
+        "mutation_only": "orange",
+        "crossover_only": "green",
+        "random_walk": "red",
+    }
+    markers: dict[str, str] = {
+        "full": "o",
+        "mutation_only": "s",
+        "crossover_only": "^",
+        "random_walk": "v",
+    }
+
+    # Top: per-run best-so-far.  The same pass also groups the raw scores by
+    # base condition for the aggregate panel below.
+    condition_scores: dict[str, list[list[float]]] = defaultdict(list)
+    for cid, records in sorted(conditions.items()):
+        scores: list[float] = [r.get("score", 0) for r in records]
+        best: list[float] = _best_so_far(scores)
+
+        # Resolve against the known names first: a plain rsplit on "_" would
+        # turn the stem "mutation_only" into "mutation" and lose its styling.
+        base_cid: str = _base_condition(cid, colors)
+        condition_scores[base_cid].append(scores)
+
+        ax1.plot(
+            best,
+            color=colors.get(base_cid, "gray"),
+            linewidth=1.5,
+            label=cid,
+            marker=markers.get(base_cid, "."),
+            markevery=max(1, len(best) // 10),
+        )
+
+    ax1.set_ylabel("Best Score")
+    ax1.set_title("Ablation Study: Best Score Convergence by Condition")
+    ax1.legend(fontsize=8, ncol=2)
+    ax1.grid(True, alpha=0.3)
+
+    # Bottom: aggregated per-condition (group by condition, average across benchmarks)
+    for cond, all_scores in sorted(condition_scores.items()):
+        # Average across benchmarks at each cycle; runs of unequal length are
+        # truncated to the shortest so every cycle averages over all runs.
+        min_len: int = min(len(s) for s in all_scores)
+        avg_scores: list[float] = [
+            sum(s[i] for s in all_scores) / len(all_scores) for i in range(min_len)
+        ]
+        best_avg: list[float] = _best_so_far(avg_scores)
+
+        ax2.plot(
+            best_avg,
+            color=colors.get(cond, "gray"),
+            linewidth=2.5,
+            label=cond,
+            marker=markers.get(cond, "."),
+            markevery=max(1, min_len // 8),
+        )
+
+    ax2.set_xlabel("Cycle")
+    ax2.set_ylabel("Best Score (avg across benchmarks)")
+    ax2.set_title("Ablation Study: Aggregate Convergence (averaged across benchmarks)")
+    ax2.legend(fontsize=10)
+    ax2.grid(True, alpha=0.3)
+
+    fig.tight_layout()
+    out: Path = CHARTS_DIR / "convergence_ablation.png"
+    fig.savefig(out, dpi=150)
+    plt.close(fig)
+    print(f"Saved {out}")
+
+
+def main() -> None:
+    """Main entry point.
+
+    Recognised command-line flags (read from ``sys.argv``):
+
+    * ``--ablation``: plot the per-condition files instead of the main log.
+    * ``--rolling N`` or ``--rolling=N``: rolling-average window; stored in
+      the module-level ``ROLLING_WINDOW`` that ``plot_main_convergence``
+      reads.
+
+    Raises:
+        SystemExit: with status 2 when the ``--rolling`` value is not an
+            integer.
+    """
+    # The global statement has to precede every use of the name inside this
+    # function: reading ROLLING_WINDOW before declaring it global is a
+    # SyntaxError that stops the whole module from compiling.
+    global ROLLING_WINDOW
+
+    args: list[str] = sys.argv[1:]
+    use_ablation: bool = "--ablation" in args
+
+    for pos, arg in enumerate(args):
+        if arg.startswith("--rolling="):
+            raw_window: str = arg.split("=", 1)[1]
+        elif arg == "--rolling" and pos + 1 < len(args):
+            # Space-separated form, as shown in the module usage text.
+            raw_window = args[pos + 1]
+        else:
+            continue
+        try:
+            ROLLING_WINDOW = int(raw_window)
+        except ValueError:
+            print(f"Invalid --rolling value: {raw_window!r} (expected an integer)")
+            sys.exit(2)
+
+    if use_ablation:
+        conditions = load_ablation_runs()
+        print(f"Loaded {len(conditions)} condition files from experiments/ablation_runs/")
+        print(f"Conditions: {', '.join(sorted(conditions.keys()))}")
+        plot_ablation_convergence(conditions)
+    else:
+        records = load_main_log()
+        n_benchmarks = len(set(r.get("benchmark", "?") for r in records))
+        print(f"Loaded {len(records)} cycles across {n_benchmarks} benchmarks")
+        plot_main_convergence(records)
+
+    print(f"Charts saved to {CHARTS_DIR}/")
+
+
+# Run the CLI only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
@@ -234,8 +234,19 @@ def evaluate_project(project_dir: str, benchmark: Benchmark | None = None, timeo
 
     has_test_files: bool = len(list(Path(project_dir).rglob("test_*.py"))) > 0
     metrics["has_tests"] = has_test_files
-    if has_test_files:
-        score += 3.0
+    test_quality: float = 0.0
+    for tf in Path(project_dir).rglob("test_*.py"):
+        try:
+            content: str = tf.read_text()
+            assertion_count: int = content.count("assert ")
+            placeholder_count: int = content.count("test_placeholder")
+            real_assertions: int = max(0, assertion_count - placeholder_count)
+            test_quality += min(real_assertions, 10)
+        except Exception:
+            pass
+    metrics["test_quality"] = round(test_quality / max(1, len(list(Path(project_dir).rglob("test_*.py")))), 1) if list(Path(project_dir).rglob("test_*.py")) else 0.0
+    if test_quality > 0:
+        score += min(test_quality, 10.0)
 
     has_readme: bool = (Path(project_dir) / "README.md").exists()
     metrics["has_readme"] = has_readme
 
@@ -54,7 +54,7 @@ def generate_code(prompt: str, model: str | None = None, temperature: float | No
         messages=[
             {
                 "role": "system",
-                "content": "You are an autonomous software architect. Generate clean executable Python projects. Output each file in a markdown code block with the filename as the language tag (e.g. ```main.py). Include a README.md and requirements.txt.",
+                "content": "You are an autonomous software architect. Generate clean executable Python projects. CRITICAL: Output each file in a markdown code block with the filename as the language tag (e.g. ```main.py). Include a README.md and requirements.txt. You MUST generate real test files (test_*.py) with real assertions — NO placeholder tests like `def test_placeholder(): assert True`. Every test must call the actual functions being tested with real inputs and verify their behavior with `assert`. Also include type hints on all function signatures and docstrings on all public functions and classes.",
             },
             {"role": "user", "content": prompt},
         ],
 
@@ -21,11 +21,13 @@
 from pathlib import Path
 from typing import Any
 
+from population_manager import PopulationEntry
+
 sys.stdout.reconfigure(line_buffering=True)
 sys.stderr.reconfigure(line_buffering=True)
 
 from generator import generate_code, write_project_files
-from mutation_engine import mutate_prompt, crossover_prompts
+from mutation_engine import mutate_prompt, crossover_prompts, record_mutation_outcome
 from population_manager import (
     load_population,
     save_population,
@@ -47,7 +49,7 @@
 # === Ablation experiment configuration ===
 # Set these before running to control which evolution operators are active.
 # Each ablation isolates one variable to measure its contribution.
-ABLATION: dict[str, bool] = {
+ABLATION: dict[str, Any] = {
     "mutation": True,       # mutate_prompt on selected parent
     "crossover": True,      # crossover_prompts on two parents
     "mutation_rate": 0.7,   # probability of mutation when both are enabled
@@ -120,36 +122,48 @@ def append_experiment_log(entry: dict[str, Any]) -> None:
         f.write(json.dumps(entry) + "\n")
 
 
-def evolve_cycle(cycle_num: int, generation: int, ablation_override: dict[str, bool] | None = None) -> float:
+def evolve_cycle(
+    cycle_num: int,
+    generation: int,
+    ablation_override: dict[str, bool] | None = None,
+    benchmark_name: str | None = None,
+) -> float:
     """Run one evolution cycle: select, mutate, generate, validate, persist.
 
-    ablation_override can be passed to run a specific ablation for this cycle.
+    ablation_override: overrides the global ABLATION dict for this cycle.
+    benchmark_name:      if set, use this specific benchmark instead of random selection.
     Falls back to the global ABLATION dict.
     """
-    config: dict[str, bool] = ablation_override if ablation_override is not None else ABLATION
+    config: dict[str, Any] = ablation_override if ablation_override is not None else dict(ABLATION)
     population = load_population()
     benchmarks: list[Benchmark] = load_benchmarks()
 
     if not population:
-        population = load_population()
+        return 0.0
 
     best = select_best(population, k=1)
     parent = best[0] if best else population[0]
-    second = select_tournament(population) if len(population) >= 2 else None
+    parent_score: float = float(parent.get("score", 0))
+    second: PopulationEntry | None = select_tournament(population) if len(population) >= 2 else None
 
     mutated_prompt: str = str(parent["prompt"])
     applied_mutation: str = "none"
     applied_crossover: str | None = None
+    mutation_desc: str = ""
 
     if config.get("crossover") and second and random.random() > config.get("mutation_rate", 0.7):
         mutated_prompt = crossover_prompts(str(parent["prompt"]), str(second["prompt"]))
         applied_mutation = "crossover"
         applied_crossover = str(second["prompt"])[:80]
     elif config.get("mutation"):
-        mutated_prompt = mutate_prompt(str(parent["prompt"]))
+        mutated_prompt, mutation_desc = mutate_prompt(str(parent["prompt"]))
         applied_mutation = "mutation"
 
-    benchmark: Benchmark = random.choice(benchmarks) if random.random() < 0.7 else benchmarks[0]
+    if benchmark_name:
+        benchmark_candidates: list[Benchmark] = [b for b in benchmarks if b.get("name") == benchmark_name]
+        benchmark = benchmark_candidates[0] if benchmark_candidates else benchmarks[0]
+    else:
+        benchmark = random.choice(benchmarks)
 
     cycle_start: float = time.time()
     metrics, task_dir, files, usage = run_benchmark(mutated_prompt, benchmark, cycle_num)
@@ -169,6 +183,9 @@ def evolve_cycle(cycle_num: int, generation: int, ablation_override: dict[str, b
         "score": total_score,
         "mutation": applied_mutation,
         "crossover_source": applied_crossover,
+        "ablation_mutation": config.get("mutation", True),
+        "ablation_crossover": config.get("crossover", True),
+        "ablation_mutation_rate": config.get("mutation_rate", 0.7),
         "files_generated": len(files),
         "syntax_valid": metrics.get("syntax", {}).get("valid", False),
         "pytest_pass": metrics.get("pytest", {}).get("success", False),
@@ -210,6 +227,10 @@ def evolve_cycle(cycle_num: int, generation: int, ablation_override: dict[str, b
         f"Tokens: {usage.get('total_tokens', 0)} | "
         f"Time: {cycle_duration:.1f}s"
     )
+    if mutation_desc:
+        score_delta: float = total_score - parent_score
+        record_mutation_outcome(mutation_desc, score_delta)
+
     print(summary)
     return total_score
Original file line number	Diff line number	Diff line change
`@@ -54,7 +54,7 @@ def generate_code(prompt: str, model: str \| None = None, temperature: float \| No`
`54`	`54`	`messages=[`
`55`	`55`	`{`
`56`	`56`	`"role": "system",`
`57`		- "content": "You are an autonomous software architect. Generate clean executable Python projects. Output each file in a markdown code block with the filename as the language tag (e.g. ```main.py). Include a README.md and requirements.txt.",
	`57`	+ "content": "You are an autonomous software architect. Generate clean executable Python projects. CRITICAL: Output each file in a markdown code block with the filename as the language tag (e.g. ```main.py). Include a README.md and requirements.txt. You MUST generate real test files (test_*.py) with real assertions — NO placeholder tests like `def test_placeholder(): assert True`. Every test must call the actual functions being tested with real inputs and verify their behavior with `assert`. Also include type hints on all function signatures and docstrings on all public functions and classes.",
`58`	`58`	`},`
`59`	`59`	`{"role": "user", "content": prompt},`
`60`	`60`	`],`