Skip to content

Commit e87b5a0

Browse files
committed
README: add experiment runner and visualization docs
1 parent ed4f59f commit e87b5a0

3 files changed

Lines changed: 62 additions & 0 deletions

File tree

README.md

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -371,6 +371,36 @@ This starts the infinite loop:
371371
4. Updates scores with execution results
372372
5. Auto-commits improvements to git
373373

374+
### Ablation Experiments
375+
376+
```bash
377+
# Run the full experiment grid (4 ablations × 3 benchmarks)
378+
python run_experiment.py
379+
380+
# Quick smoke test (10 cycles per condition)
381+
python run_experiment.py --quick
382+
383+
# Preview what will run
384+
python run_experiment.py --list
385+
386+
# Resume incomplete runs
387+
python run_experiment.py --resume
388+
```
389+
390+
Results are logged to `experiments/run_log.jsonl` and per-condition files in `experiments/ablation_runs/`.
391+
392+
### Visualize Convergence
393+
394+
```bash
395+
# Plot from main experiment log
396+
python analysis/plot_convergence.py
397+
398+
# Plot from ablation runs with rolling average
399+
python analysis/plot_convergence.py --ablation --rolling=5
400+
```
401+
402+
Charts are saved to `analysis/charts/`. Plotting requires `matplotlib`.
403+
374404
### Manual Evolution
375405

376406
```bash

analysis/charts/.gitkeep

Whitespace-only changes.

benchmarks/tasks.json

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,5 +22,37 @@
2222
"hidden_test_files": {
2323
"test_hidden_pipeline.py": "import subprocess\nimport json\nimport os\nimport tempfile\n\n\ndef test_read_csv():\n from main import read_csv\n with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f:\n f.write('name,age,city\\nAlice,30,NYC\\nBob,25,LA\\n')\n path = f.name\n data = read_csv(path)\n os.unlink(path)\n assert len(data) == 2\n assert data[0]['name'] == 'Alice'\n\n\ndef test_filter_data():\n from main import filter_data\n data = [{'name': 'Alice', 'status': 'active'}, {'name': 'Bob', 'status': 'inactive'}]\n result = filter_data(data, 'status', 'active')\n assert len(result) == 1\n assert result[0]['name'] == 'Alice'\n\n\ndef test_aggregate_sum():\n from main import aggregate\n data = [{'dept': 'eng', 'salary': 100}, {'dept': 'eng', 'salary': 200}, {'dept': 'hr', 'salary': 150}]\n result = aggregate(data, 'dept', 'salary', 'sum')\n assert result['eng'] == 300\n assert result['hr'] == 150\n\n\ndef test_aggregate_avg():\n from main import aggregate\n data = [{'dept': 'eng', 'salary': 100}, {'dept': 'eng', 'salary': 200}]\n result = aggregate(data, 'dept', 'salary', 'avg')\n assert result['eng'] == 150\n\n\ndef test_main_pipeline():\n with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f:\n f.write('name,department,salary,status\\nAlice,Engineering,100000,active\\nBob,Engineering,120000,active\\nCharlie,HR,80000,inactive\\n')\n input_path = f.name\n output_path = input_path.replace('.csv', '_report.json')\n result = subprocess.run(['python', 'main.py', input_path], capture_output=True, text=True, timeout=10)\n os.unlink(input_path)\n assert result.returncode == 0, f'pipeline failed: {result.stderr}'\n assert os.path.exists(output_path), f'report not created at {output_path}'\n with open(output_path) as f:\n report = json.load(f)\n os.unlink(output_path)\n assert isinstance(report, dict)\n"
2424
}
25+
},
26+
{
27+
"name": "flask_api",
28+
"prompt": "Create a Flask REST API in app.py. Must have these endpoints: GET /items returns list of all items (JSON array), POST /items creates a new item from JSON body with 'name' and 'price' fields and returns the created item with a generated id, GET /items/<id> returns a single item by id, PUT /items/<id> updates an item's fields, DELETE /items/<id> deletes an item. Data must persist in memory using a Python list/dict. Each item must have id (auto-incrementing integer), name (string), price (float), created_at (ISO timestamp string). Include proper error handling with 404 for missing items and 400 for bad input. Main entry point in run.py that imports app and runs app.run(port=5000).",
29+
"requirements": ["pytest", "flask"],
30+
"hidden_test_files": {
31+
"test_hidden_api.py": "import subprocess\nimport json\nimport sys\nimport os\n\n\ndef test_import():\n import importlib.util\n spec = importlib.util.find_spec('app')\n assert spec is not None, 'app.py not found'\n\n\ndef test_create_item():\n result = subprocess.run(\n [sys.executable, '-c', '''\nimport json\nfrom app import app\nclient = app.test_client()\nresp = client.post(\"/items\", json={\"name\": \"test\", \"price\": 9.99})\nprint(resp.status_code)\ndata = json.loads(resp.data)\nprint(data[\"name\"], data[\"price\"])\n'''],\n capture_output=True, text=True, timeout=10\n )\n assert result.returncode == 0, f'failed: {result.stderr}'\n\n\ndef test_list_items():\n result = subprocess.run(\n [sys.executable, '-c', '''\nimport json\nfrom app import app\nclient = app.test_client()\nresp = client.get(\"/items\")\ndata = json.loads(resp.data)\nprint(len(data))\n'''],\n capture_output=True, text=True, timeout=10\n )\n assert result.returncode == 0\n\n\ndef test_get_single_item():\n result = subprocess.run(\n [sys.executable, '-c', '''\nimport json\nfrom app import app\nclient = app.test_client()\nresp = client.get(\"/items/999\")\nassert resp.status_code == 404\nprint(\"correctly returned 404\")\n'''],\n capture_output=True, text=True, timeout=10\n )\n assert result.returncode == 0\n assert '404' in result.stdout or 'correctly' in result.stdout\n\n\ndef test_delete_item():\n result = subprocess.run(\n [sys.executable, '-c', '''\nimport json\nfrom app import app\nclient = app.test_client()\nresp = client.delete(\"/items/999\")\nassert resp.status_code == 404\nprint(\"delete 404 works\")\n'''],\n capture_output=True, text=True, timeout=10\n )\n assert result.returncode == 0\n"
32+
}
33+
},
34+
{
35+
"name": "math_library",
36+
"prompt": "Create a pure Python math library in mathlib.py. Must have these functions: fibonacci(n) returns the nth Fibonacci number (0-indexed, fib(0)=0, fib(1)=1) using an efficient algorithm (not naive recursion), is_prime(n) returns True if n is prime, gcd(a, b) returns the greatest common divisor using Euclidean algorithm, lcm(a, b) returns the least common multiple, factorial(n) returns n! (product of 1..n), is_palindrome(n) returns True if the number reads the same forward and backward. Use integer arithmetic only (no floating point). Provide main.py that reads a command and argument from sys.argv and prints the result (e.g. python main.py fibonacci 10 -> 55).",
37+
"requirements": ["pytest"],
38+
"hidden_test_files": {
39+
"test_hidden_math.py": "import subprocess\nimport sys\n\n\ndef test_fibonacci():\n from mathlib import fibonacci\n assert fibonacci(0) == 0\n assert fibonacci(1) == 1\n assert fibonacci(10) == 55\n assert fibonacci(20) == 6765\n\n\ndef test_is_prime():\n from mathlib import is_prime\n assert not is_prime(1)\n assert is_prime(2)\n assert is_prime(17)\n assert not is_prime(100)\n\n\ndef test_gcd():\n from mathlib import gcd\n assert gcd(48, 18) == 6\n assert gcd(0, 5) == 5\n assert gcd(7, 13) == 1\n\n\ndef test_lcm():\n from mathlib import lcm\n assert lcm(4, 6) == 12\n assert lcm(7, 13) == 91\n\n\ndef test_factorial():\n from mathlib import factorial\n assert factorial(0) == 1\n assert factorial(5) == 120\n assert factorial(10) == 3628800\n\n\ndef test_is_palindrome():\n from mathlib import is_palindrome\n assert is_palindrome(121)\n assert not is_palindrome(123)\n assert is_palindrome(1)\n\n\ndef test_main_fibonacci():\n result = subprocess.run([sys.executable, 'main.py', 'fibonacci', '10'], capture_output=True, text=True, timeout=5)\n assert result.returncode == 0\n assert '55' in result.stdout\n\n\ndef test_main_prime():\n result = subprocess.run([sys.executable, 'main.py', 'is_prime', '17'], capture_output=True, text=True, timeout=5)\n assert result.returncode == 0\n assert 'True' in result.stdout\n"
40+
}
41+
},
42+
{
43+
"name": "file_indexer",
44+
"prompt": "Create a Python file indexer in indexer.py. Must have functions: index_directory(path) that walks a directory recursively and returns a dict mapping filename -> {size, modified, lines} where lines is the number of lines in the file. Must use os.walk. search_by_name(index, pattern) that returns files matching a glob pattern (support * and ?). search_by_size(index, min_size, max_size) that filters files by size in bytes. search_duplicates(index) that finds files with the same size (potential duplicates). stats(index) returns a dict with keys total_files (total file count), total_size (total size in bytes), avg_size (average file size). Provide main.py that accepts a directory path from sys.argv and prints a summary report.",
45+
"requirements": ["pytest"],
46+
"hidden_test_files": {
47+
"test_hidden_indexer.py": "import os\nimport sys\nimport tempfile\n\n\ndef setup_index(tmp_dir):\n from indexer import index_directory\n os.makedirs(tmp_dir + '/sub', exist_ok=True)\n with open(tmp_dir + '/a.txt', 'w') as f: f.write('hello\\nworld\\n')\n with open(tmp_dir + '/b.py', 'w') as f: f.write('def foo():\\n pass\\n')\n with open(tmp_dir + '/sub/c.txt', 'w') as f: f.write('test')\n return index_directory(tmp_dir)\n\n\ndef test_index_directory():\n with tempfile.TemporaryDirectory() as tmp:\n idx = setup_index(tmp)\n assert len(idx) == 3\n assert any('a.txt' in k for k in idx)\n assert any('b.py' in k for k in idx)\n\n\ndef test_search_by_name():\n from indexer import search_by_name\n with tempfile.TemporaryDirectory() as tmp:\n idx = setup_index(tmp)\n result = search_by_name(idx, '*.txt')\n assert len(result) >= 2\n\n\ndef test_search_by_size():\n from indexer import search_by_size\n with tempfile.TemporaryDirectory() as tmp:\n idx = setup_index(tmp)\n result = search_by_size(idx, 0, 100)\n assert len(result) >= 1\n\n\ndef test_stats():\n from indexer import stats\n with tempfile.TemporaryDirectory() as tmp:\n idx = setup_index(tmp)\n s = stats(idx)\n assert s['total_files'] >= 3\n assert s['total_size'] > 10\n assert s['avg_size'] > 0\n\n\ndef test_search_duplicates():\n from indexer import search_duplicates\n with tempfile.TemporaryDirectory() as tmp:\n os.makedirs(tmp + '/sub', exist_ok=True)\n with open(tmp + '/a.txt', 'w') as f: f.write('same content')\n with open(tmp + '/b.txt', 'w') as f: f.write('same content')\n with open(tmp + '/c.txt', 'w') as f: f.write('different')\n idx = {}\n for fn in os.listdir(tmp):\n fp = os.path.join(tmp, fn)\n if os.path.isfile(fp):\n idx[fn] = {'size': os.path.getsize(fp), 'modified': os.path.getmtime(fp), 'lines': 1}\n dups = search_duplicates(idx)\n assert len(dups) >= 1\n"
48+
}
49+
},
50+
{
51+
"name": "caching_layer",
52+
"prompt": "Create a Python caching library in cache.py. Implement a Cache class with: __init__(self, max_size=100, ttl_seconds=300) that sets max cache entries and default TTL. get(key) returns the value if key exists and is not expired, else None. set(key, value, ttl=None) stores a value with optional TTL override. delete(key) removes a key. clear() removes all entries. keys() returns list of valid (non-expired) keys. size property returns number of valid entries. Must be thread-safe using threading.Lock. Must track hit/miss counts and expose stats() returning a dict with hits, misses, hit_ratio, size, max_size. Provide main.py that demonstrates the cache with a simple CLI: python main.py set mykey myvalue, python main.py get mykey.",
53+
"requirements": ["pytest"],
54+
"hidden_test_files": {
55+
"test_hidden_cache.py": "import subprocess\nimport time\nimport sys\n\n\ndef test_cache_basic():\n from cache import Cache\n c = Cache(max_size=10, ttl_seconds=60)\n c.set('key1', 'value1')\n assert c.get('key1') == 'value1'\n assert c.get('nonexistent') is None\n\n\ndef test_cache_ttl():\n from cache import Cache\n c = Cache(max_size=10, ttl_seconds=1)\n c.set('key1', 'value1')\n assert c.get('key1') == 'value1'\n time.sleep(1.5)\n assert c.get('key1') is None\n\n\ndef test_cache_delete():\n from cache import Cache\n c = Cache()\n c.set('key1', 'value1')\n c.delete('key1')\n assert c.get('key1') is None\n\n\ndef test_cache_clear():\n from cache import Cache\n c = Cache()\n c.set('a', 1)\n c.set('b', 2)\n c.clear()\n assert c.size == 0\n\n\ndef test_cache_max_size():\n from cache import Cache\n c = Cache(max_size=2)\n c.set('a', 1)\n c.set('b', 2)\n c.set('c', 3)\n assert c.size <= 2\n\n\ndef test_cache_stats():\n from cache import Cache\n c = Cache()\n c.set('x', 1)\n c.get('x')\n c.get('y')\n s = c.stats()\n assert s['hits'] == 1\n assert s['misses'] == 1\n assert s['hit_ratio'] == 0.5\n\n\ndef test_cache_thread_safe():\n from cache import Cache\n c = Cache()\n import threading\n errors = []\n def worker():\n for i in range(100):\n try:\n c.set(str(i), i)\n c.get(str(i))\n except Exception as e:\n errors.append(e)\n threads = [threading.Thread(target=worker) for _ in range(4)]\n for t in threads: t.start()\n for t in threads: t.join()\n assert len(errors) == 0, f'thread safety errors: {errors}'\n\n\ndef test_main_set_get():\n result = subprocess.run([sys.executable, 'main.py', 'set', 'testkey', 'testval'], capture_output=True, text=True, timeout=5)\n result2 = subprocess.run([sys.executable, 'main.py', 'get', 'testkey'], capture_output=True, text=True, timeout=5)\n assert result.returncode == 0\n assert result2.returncode == 0\n"
56+
}
2557
}
2658
]

0 commit comments

Comments
 (0)