README: add experiment runner and visualization docs

NullLabTests · NullLabTests · commit e87b5a097d8b · 2026-05-28T18:18:38.000+01:00
diff --git a/README.md b/README.md
@@ -371,6 +371,36 @@ This starts the infinite loop:
 4. Updates scores with execution results
 5. Auto-commits improvements to git
 
+### Ablation Experiments
+
+```bash
+# Run the full experiment grid (4 ablations × 3 benchmarks)
+python run_experiment.py
+
+# Quick smoke test (10 cycles per condition)
+python run_experiment.py --quick
+
+# Preview what will run
+python run_experiment.py --list
+
+# Resume incomplete runs
+python run_experiment.py --resume
+```
+
+Results are logged to `experiments/run_log.jsonl` and to per-condition files in `experiments/ablation_runs/`.
+
+### Visualize Convergence
+
+```bash
+# Plot from main experiment log
+python analysis/plot_convergence.py
+
+# Plot from ablation runs with rolling average
+python analysis/plot_convergence.py --ablation --rolling=5
+```
+
+Charts are saved to `analysis/charts/`. Plotting requires `matplotlib`.
+
 ### Manual Evolution
 
 ```bash
diff --git a/analysis/charts/.gitkeep b/analysis/charts/.gitkeep
diff --git a/benchmarks/tasks.json b/benchmarks/tasks.json
@@ -22,5 +22,37 @@
     "hidden_test_files": {
       "test_hidden_pipeline.py": "import subprocess\nimport json\nimport os\nimport tempfile\n\n\ndef test_read_csv():\n    from main import read_csv\n    with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f:\n        f.write('name,age,city\\nAlice,30,NYC\\nBob,25,LA\\n')\n        path = f.name\n    data = read_csv(path)\n    os.unlink(path)\n    assert len(data) == 2\n    assert data[0]['name'] == 'Alice'\n\n\ndef test_filter_data():\n    from main import filter_data\n    data = [{'name': 'Alice', 'status': 'active'}, {'name': 'Bob', 'status': 'inactive'}]\n    result = filter_data(data, 'status', 'active')\n    assert len(result) == 1\n    assert result[0]['name'] == 'Alice'\n\n\ndef test_aggregate_sum():\n    from main import aggregate\n    data = [{'dept': 'eng', 'salary': 100}, {'dept': 'eng', 'salary': 200}, {'dept': 'hr', 'salary': 150}]\n    result = aggregate(data, 'dept', 'salary', 'sum')\n    assert result['eng'] == 300\n    assert result['hr'] == 150\n\n\ndef test_aggregate_avg():\n    from main import aggregate\n    data = [{'dept': 'eng', 'salary': 100}, {'dept': 'eng', 'salary': 200}]\n    result = aggregate(data, 'dept', 'salary', 'avg')\n    assert result['eng'] == 150\n\n\ndef test_main_pipeline():\n    with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f:\n        f.write('name,department,salary,status\\nAlice,Engineering,100000,active\\nBob,Engineering,120000,active\\nCharlie,HR,80000,inactive\\n')\n        input_path = f.name\n    output_path = input_path.replace('.csv', '_report.json')\n    result = subprocess.run(['python', 'main.py', input_path], capture_output=True, text=True, timeout=10)\n    os.unlink(input_path)\n    assert result.returncode == 0, f'pipeline failed: {result.stderr}'\n    assert os.path.exists(output_path), f'report not created at {output_path}'\n    with open(output_path) as f:\n        report = json.load(f)\n    os.unlink(output_path)\n    assert isinstance(report, 
dict)\n"
     }
+  },
+  {
+    "name": "flask_api",
+    "prompt": "Create a Flask REST API in app.py. Must have these endpoints: GET /items returns list of all items (JSON array), POST /items creates a new item from JSON body with 'name' and 'price' fields and returns the created item with a generated id, GET /items/<id> returns a single item by id, PUT /items/<id> updates an item's fields, DELETE /items/<id> deletes an item. Data must persist in memory using a Python list/dict. Each item must have id (auto-incrementing integer), name (string), price (float), created_at (ISO timestamp string). Include proper error handling with 404 for missing items and 400 for bad input. Main entry point in run.py that imports app and runs app.run(port=5000).",
+    "requirements": ["pytest", "flask"],
+    "hidden_test_files": {
+      "test_hidden_api.py": "import subprocess\nimport json\nimport sys\nimport os\n\n\ndef test_import():\n    import importlib.util\n    spec = importlib.util.find_spec('app')\n    assert spec is not None, 'app.py not found'\n\n\ndef test_create_item():\n    result = subprocess.run(\n        [sys.executable, '-c', '''\nimport json\nfrom app import app\nclient = app.test_client()\nresp = client.post(\"/items\", json={\"name\": \"test\", \"price\": 9.99})\nprint(resp.status_code)\ndata = json.loads(resp.data)\nprint(data[\"name\"], data[\"price\"])\n'''],\n        capture_output=True, text=True, timeout=10\n    )\n    assert result.returncode == 0, f'failed: {result.stderr}'\n\n\ndef test_list_items():\n    result = subprocess.run(\n        [sys.executable, '-c', '''\nimport json\nfrom app import app\nclient = app.test_client()\nresp = client.get(\"/items\")\ndata = json.loads(resp.data)\nprint(len(data))\n'''],\n        capture_output=True, text=True, timeout=10\n    )\n    assert result.returncode == 0\n\n\ndef test_get_single_item():\n    result = subprocess.run(\n        [sys.executable, '-c', '''\nimport json\nfrom app import app\nclient = app.test_client()\nresp = client.get(\"/items/999\")\nassert resp.status_code == 404\nprint(\"correctly returned 404\")\n'''],\n        capture_output=True, text=True, timeout=10\n    )\n    assert result.returncode == 0\n    assert '404' in result.stdout or 'correctly' in result.stdout\n\n\ndef test_delete_item():\n    result = subprocess.run(\n        [sys.executable, '-c', '''\nimport json\nfrom app import app\nclient = app.test_client()\nresp = client.delete(\"/items/999\")\nassert resp.status_code == 404\nprint(\"delete 404 works\")\n'''],\n        capture_output=True, text=True, timeout=10\n    )\n    assert result.returncode == 0\n"
+    }
+  },
+  {
+    "name": "math_library",
+    "prompt": "Create a pure Python math library in mathlib.py. Must have these functions: fibonacci(n) returns the nth Fibonacci number (0-indexed, fib(0)=0, fib(1)=1) using an efficient algorithm (not naive recursion), is_prime(n) returns True if n is prime, gcd(a, b) returns the greatest common divisor using Euclidean algorithm, lcm(a, b) returns the least common multiple, factorial(n) returns n! (product of 1..n), is_palindrome(n) returns True if the number reads the same forward and backward. Use integer arithmetic only (no floating point). Provide main.py that reads a command and argument from sys.argv and prints the result (e.g. python main.py fibonacci 10 -> 55).",
+    "requirements": ["pytest"],
+    "hidden_test_files": {
+      "test_hidden_math.py": "import subprocess\nimport sys\n\n\ndef test_fibonacci():\n    from mathlib import fibonacci\n    assert fibonacci(0) == 0\n    assert fibonacci(1) == 1\n    assert fibonacci(10) == 55\n    assert fibonacci(20) == 6765\n\n\ndef test_is_prime():\n    from mathlib import is_prime\n    assert not is_prime(1)\n    assert is_prime(2)\n    assert is_prime(17)\n    assert not is_prime(100)\n\n\ndef test_gcd():\n    from mathlib import gcd\n    assert gcd(48, 18) == 6\n    assert gcd(0, 5) == 5\n    assert gcd(7, 13) == 1\n\n\ndef test_lcm():\n    from mathlib import lcm\n    assert lcm(4, 6) == 12\n    assert lcm(7, 13) == 91\n\n\ndef test_factorial():\n    from mathlib import factorial\n    assert factorial(0) == 1\n    assert factorial(5) == 120\n    assert factorial(10) == 3628800\n\n\ndef test_is_palindrome():\n    from mathlib import is_palindrome\n    assert is_palindrome(121)\n    assert not is_palindrome(123)\n    assert is_palindrome(1)\n\n\ndef test_main_fibonacci():\n    result = subprocess.run([sys.executable, 'main.py', 'fibonacci', '10'], capture_output=True, text=True, timeout=5)\n    assert result.returncode == 0\n    assert '55' in result.stdout\n\n\ndef test_main_prime():\n    result = subprocess.run([sys.executable, 'main.py', 'is_prime', '17'], capture_output=True, text=True, timeout=5)\n    assert result.returncode == 0\n    assert 'True' in result.stdout\n"
+    }
+  },
+  {
+    "name": "file_indexer",
+    "prompt": "Create a Python file indexer in indexer.py. Must have functions: index_directory(path) that walks a directory recursively and returns a dict mapping filename -> {size, modified, lines} where lines is the number of lines in the file. Must use os.walk. search_by_name(index, pattern) that returns files matching a glob pattern (support * and ?). search_by_size(index, min_size, max_size) that filters files by size in bytes. search_duplicates(index) that finds files with the same size (potential duplicates). stats(index) returns total file count, total size, average file size. Provide main.py that accepts a directory path from sys.argv and prints a summary report.",
+    "requirements": ["pytest"],
+    "hidden_test_files": {
+      "test_hidden_indexer.py": "import os\nimport sys\nimport tempfile\n\n\ndef setup_index(tmp_dir):\n    from indexer import index_directory\n    os.makedirs(tmp_dir + '/sub', exist_ok=True)\n    with open(tmp_dir + '/a.txt', 'w') as f: f.write('hello\\nworld\\n')\n    with open(tmp_dir + '/b.py', 'w') as f: f.write('def foo():\\n    pass\\n')\n    with open(tmp_dir + '/sub/c.txt', 'w') as f: f.write('test')\n    return index_directory(tmp_dir)\n\n\ndef test_index_directory():\n    with tempfile.TemporaryDirectory() as tmp:\n        idx = setup_index(tmp)\n        assert len(idx) == 3\n        assert any('a.txt' in k for k in idx)\n        assert any('b.py' in k for k in idx)\n\n\ndef test_search_by_name():\n    from indexer import search_by_name\n    with tempfile.TemporaryDirectory() as tmp:\n        idx = setup_index(tmp)\n        result = search_by_name(idx, '*.txt')\n        assert len(result) >= 2\n\n\ndef test_search_by_size():\n    from indexer import search_by_size\n    with tempfile.TemporaryDirectory() as tmp:\n        idx = setup_index(tmp)\n        result = search_by_size(idx, 0, 100)\n        assert len(result) >= 1\n\n\ndef test_stats():\n    from indexer import stats\n    with tempfile.TemporaryDirectory() as tmp:\n        idx = setup_index(tmp)\n        s = stats(idx)\n        assert s['total_files'] >= 3\n        assert s['total_size'] > 10\n        assert s['avg_size'] > 0\n\n\ndef test_search_duplicates():\n    from indexer import search_duplicates\n    with tempfile.TemporaryDirectory() as tmp:\n        os.makedirs(tmp + '/sub', exist_ok=True)\n        with open(tmp + '/a.txt', 'w') as f: f.write('same content')\n        with open(tmp + '/b.txt', 'w') as f: f.write('same content')\n        with open(tmp + '/c.txt', 'w') as f: f.write('different')\n        idx = {}\n        for fn in os.listdir(tmp):\n            fp = os.path.join(tmp, fn)\n            if os.path.isfile(fp):\n                idx[fn] = {'size': os.path.getsize(fp), 
'modified': os.path.getmtime(fp), 'lines': 1}\n        dups = search_duplicates(idx)\n        assert len(dups) >= 1\n"
+    }
+  },
+  {
+    "name": "caching_layer",
+    "prompt": "Create a Python caching library in cache.py. Implement a Cache class with: __init__(self, max_size=100, ttl_seconds=300) that sets max cache entries and default TTL. get(key) returns the value if key exists and is not expired, else None. set(key, value, ttl=None) stores a value with optional TTL override. delete(key) removes a key. clear() removes all entries. keys() returns list of valid (non-expired) keys. size property returns number of valid entries. Must be thread-safe using threading.Lock. Must track hit/miss counts and expose stats() returning a dict with hits, misses, hit_ratio, size, max_size. Provide main.py that demonstrates the cache with a simple CLI: python main.py set mykey myvalue, python main.py get mykey.",
+    "requirements": ["pytest"],
+    "hidden_test_files": {
+      "test_hidden_cache.py": "import subprocess\nimport sys\nimport time\n\n\ndef test_cache_basic():\n    from cache import Cache\n    c = Cache(max_size=10, ttl_seconds=60)\n    c.set('key1', 'value1')\n    assert c.get('key1') == 'value1'\n    assert c.get('nonexistent') is None\n\n\ndef test_cache_ttl():\n    from cache import Cache\n    c = Cache(max_size=10, ttl_seconds=1)\n    c.set('key1', 'value1')\n    assert c.get('key1') == 'value1'\n    time.sleep(1.5)\n    assert c.get('key1') is None\n\n\ndef test_cache_delete():\n    from cache import Cache\n    c = Cache()\n    c.set('key1', 'value1')\n    c.delete('key1')\n    assert c.get('key1') is None\n\n\ndef test_cache_clear():\n    from cache import Cache\n    c = Cache()\n    c.set('a', 1)\n    c.set('b', 2)\n    c.clear()\n    assert c.size == 0\n\n\ndef test_cache_max_size():\n    from cache import Cache\n    c = Cache(max_size=2)\n    c.set('a', 1)\n    c.set('b', 2)\n    c.set('c', 3)\n    assert c.size <= 2\n\n\ndef test_cache_stats():\n    from cache import Cache\n    c = Cache()\n    c.set('x', 1)\n    c.get('x')\n    c.get('y')\n    s = c.stats()\n    assert s['hits'] == 1\n    assert s['misses'] == 1\n    assert s['hit_ratio'] == 0.5\n\n\ndef test_cache_thread_safe():\n    from cache import Cache\n    c = Cache()\n    import threading\n    errors = []\n    def worker():\n        for i in range(100):\n            try:\n                c.set(str(i), i)\n                c.get(str(i))\n            except Exception as e:\n                errors.append(e)\n    threads = [threading.Thread(target=worker) for _ in range(4)]\n    for t in threads: t.start()\n    for t in threads: t.join()\n    assert len(errors) == 0, f'thread safety errors: {errors}'\n\n\ndef test_main_set_get():\n    result = subprocess.run([sys.executable, 'main.py', 'set', 'testkey', 'testval'], capture_output=True, text=True, timeout=5)\n    result2 = subprocess.run([sys.executable, 'main.py', 'get', 'testkey'], capture_output=True, text=True, timeout=5)\n    assert result.returncode == 0\n    assert result2.returncode == 0\n"
+    }
   }
 ]