Skip to content

Commit ca6e68f

Browse files
that-github-user, unknown, and claude
authored
Document ensemble test generation workflow (#159) (#160)
* Add test collection script for ensemble test generation demo * Document two-phase ensemble workflow for test generation (#159) - Add "Recommended workflows" section to README with two-phase pattern: generate tests via ensemble first, then implement against converged tests - Include ensemble-generated test suite (test_pathfinding_generated.py) produced by 3/5 agents, all converging on correct maze path length (9) - Add test collection wrapper script for --collect-only validation Addresses #159 (Option A: documented workflow) Also references #31 (A* showcase evidence) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: unknown <that-github-user@github.com> Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 1278f3d commit ca6e68f

3 files changed

Lines changed: 345 additions & 0 deletions

File tree

README.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,28 @@ The key insight: **parallel attempts cost more tokens but not more time.** All a
124124
- **Complex refactors** — many files, easy to miss something
125125
- **Unfamiliar codebases** — multiple attempts reduce the chance of going down the wrong path
126126

127+
## Recommended workflows
128+
129+
### Two-phase: generate tests, then implement
130+
131+
A single agent can write a wrong test that becomes a false oracle. Use the ensemble to validate your test suite before using it to judge implementations.
132+
133+
**Phase 1 — generate tests:**
134+
```bash
135+
thinktank run "write unit tests for grid.py pathfinding" -n 5 -t "bash run-collect-tests.sh"
136+
thinktank compare 1 2 # compare assertions across agents
137+
```
138+
139+
If all agents assert the same expected values, the tests are likely correct. If they disagree on a specific assertion (e.g., 3 agents say path length 9, 1 says 13), investigate before proceeding.
140+
141+
**Phase 2 — implement:**
142+
```bash
143+
thinktank apply # apply the converged test suite
144+
thinktank run "implement A* pathfinding in grid.py" -n 5 -t "python -m pytest"
145+
```
146+
147+
**Why this matters:** During development, a single agent wrote a test asserting a shortest path of 13 steps when the correct answer was 9. This wrong test caused 13+ ensemble runs to show 0% pass rate — every agent was right, but the oracle was wrong. Using ensemble test generation would have caught the bad assertion via convergence analysis before it became the ground truth.
148+
127149
## Commands
128150

129151
### `thinktank run [prompt]`
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
#!/bin/bash
# Check that pytest can collect the generated suite; --collect-only lists the
# tests without running them.
# Work from this script's own directory so the relative test path resolves
# regardless of the caller's working directory; bail out if that fails.
cd "$(dirname "$0")" || exit
python -m pytest test_pathfinding_generated.py --collect-only
Lines changed: 321 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,321 @@
1+
"""
2+
Comprehensive unit tests for A* pathfinding — generated test suite.
3+
4+
Tests use unittest (not pytest fixtures).
5+
Import: find_path, Point from grid
6+
"""
7+
8+
import time
9+
import unittest
10+
from grid import find_path, Point
11+
12+
13+
def is_valid_path(grid: list[list[int]], path: list[Point]) -> bool:
    """Return True if *path* is a non-empty walk over walkable cells of *grid*.

    Every cell must lie inside the grid and hold the value 0, and each
    consecutive pair of cells must be exactly one orthogonal step apart
    (Manhattan distance 1).

    An empty path is rejected: a route between two cells contains at least
    the start cell, so accepting ``[]`` would let the validity assertions
    pass vacuously.
    """
    if not path:
        return False
    previous = None
    for r, c in path:
        # Bound the column by the row actually indexed, so a short row yields
        # False instead of an IndexError.
        if not (0 <= r < len(grid) and 0 <= c < len(grid[r])):
            return False
        if grid[r][c] != 0:
            return False
        if previous is not None:
            pr, pc = previous
            if abs(r - pr) + abs(c - pc) != 1:
                return False
        previous = (r, c)
    return True
25+
26+
27+
class TestStraightLine(unittest.TestCase):
    """Test 1: Shortest paths on grids with no obstacles."""

    def test_horizontal_no_obstacles(self):
        # Single row, walk right from col 0 to col 5 — path length must be 6
        grid = [[0, 0, 0, 0, 0, 0]]
        result = find_path(grid, (0, 0), (0, 5))
        self.assertIsNotNone(result)
        self.assertEqual(result.path[0], (0, 0))
        self.assertEqual(result.path[-1], (0, 5))
        self.assertEqual(len(result.path), 6)
        self.assertTrue(is_valid_path(grid, result.path))

    def test_vertical_no_obstacles(self):
        # Single column, walk down from row 0 to row 4 — path length must be 5
        # Each row is built as its own list so no two rows share one object.
        grid = [[0] for _ in range(5)]
        result = find_path(grid, (0, 0), (4, 0))
        self.assertIsNotNone(result)
        self.assertEqual(result.path[0], (0, 0))
        self.assertEqual(result.path[-1], (4, 0))
        self.assertEqual(len(result.path), 5)
        self.assertTrue(is_valid_path(grid, result.path))

    def test_diagonal_corner_open_grid(self):
        # Open 4x4 grid: Manhattan distance from (0,0) to (3,3) = 6, so path length = 7
        grid = [
            [0, 0, 0, 0],
            [0, 0, 0, 0],
            [0, 0, 0, 0],
            [0, 0, 0, 0],
        ]
        result = find_path(grid, (0, 0), (3, 3))
        self.assertIsNotNone(result)
        # Pin the endpoints too: length and validity alone would accept a
        # 7-node walk between two other cells.
        self.assertEqual(result.path[0], (0, 0))
        self.assertEqual(result.path[-1], (3, 3))
        self.assertEqual(len(result.path), 7)
        self.assertTrue(is_valid_path(grid, result.path))
62+
63+
64+
class TestAroundObstacles(unittest.TestCase):
    """Test 2: Path must detour around walls."""

    def test_single_wall_column(self):
        # Grid 3 rows x 5 cols. The wall occupies col 2 in rows 0–1, so col 2
        # can only be crossed in row 2.
        # Start (0,0) → End (0,4): every shortest route drops to row 2, crosses
        # there and climbs back: 2 down + 4 right + 2 up = 8 moves = 9 nodes.
        # Several routes tie (e.g. via col 0 or col 1), so only length is pinned.
        grid = [
            [0, 0, 1, 0, 0],
            [0, 0, 1, 0, 0],
            [0, 0, 0, 0, 0],
        ]
        result = find_path(grid, (0, 0), (0, 4))
        self.assertIsNotNone(result)
        self.assertEqual(result.path[0], (0, 0))
        self.assertEqual(result.path[-1], (0, 4))
        self.assertEqual(len(result.path), 9)
        self.assertTrue(is_valid_path(grid, result.path))

    def test_u_shaped_obstacle(self):
        # NOTE: despite the test name, the obstacle below is a solid 2x2 block
        # in the centre of a 4x4 grid, not a U shape.
        # Start (0,0) → End (3,3). The outer ring is fully open, so a route
        # along the border matches the Manhattan distance: 6 moves = 7 nodes.
        grid = [
            [0, 0, 0, 0],
            [0, 1, 1, 0],
            [0, 1, 1, 0],
            [0, 0, 0, 0],
        ]
        result = find_path(grid, (0, 0), (3, 3))
        self.assertIsNotNone(result)
        # Endpoints are checked here as in test_single_wall_column.
        self.assertEqual(result.path[0], (0, 0))
        self.assertEqual(result.path[-1], (3, 3))
        self.assertTrue(is_valid_path(grid, result.path))
        self.assertEqual(len(result.path), 7)
99+
100+
101+
class TestUnreachable(unittest.TestCase):
    """Test 3: No route connects start and goal, so find_path returns None."""

    @staticmethod
    def _walled_centre():
        """Return a fresh 5x5 grid whose open centre cell (2,2) is ringed by walls."""
        return [
            [0, 0, 0, 0, 0],
            [0, 1, 1, 1, 0],
            [0, 1, 0, 1, 0],
            [0, 1, 1, 1, 0],
            [0, 0, 0, 0, 0],
        ]

    def test_target_enclosed_by_walls(self):
        # Goal (2,2) is the walkable centre cell; all of its neighbours are walls.
        outcome = find_path(self._walled_centre(), (0, 0), (2, 2))
        self.assertIsNone(outcome)

    def test_source_enclosed_by_walls(self):
        # Same layout reversed: the search starts inside the ring at (2,2).
        outcome = find_path(self._walled_centre(), (2, 2), (0, 0))
        self.assertIsNone(outcome)

    def test_full_wall_column_divides_grid(self):
        # Col 1 is wall in every row, cutting col 0 off from col 2.
        split = [[0, 1, 0] for _ in range(3)]
        outcome = find_path(split, (0, 0), (2, 2))
        self.assertIsNone(outcome)
137+
138+
139+
class TestStartEqualsEnd(unittest.TestCase):
    """Test 4: Searching from a cell to itself yields a one-node path."""

    def test_same_cell_returns_single_node_path(self):
        cell = (1, 1)
        open_grid = [[0, 0, 0], [0, 0, 0], [0, 0, 0]]
        found = find_path(open_grid, cell, cell)
        self.assertIsNotNone(found)
        self.assertEqual(len(found.path), 1)
        self.assertEqual(found.path[0], (1, 1))

    def test_same_cell_top_left_corner(self):
        corner = (0, 0)
        open_grid = [[0, 0], [0, 0]]
        found = find_path(open_grid, corner, corner)
        self.assertIsNotNone(found)
        self.assertEqual(len(found.path), 1)
        self.assertEqual(found.path[0], (0, 0))

    def test_same_cell_nodes_explored_minimal(self):
        corner = (0, 0)
        open_grid = [[0, 0], [0, 0]]
        found = find_path(open_grid, corner, corner)
        self.assertIsNotNone(found)
        # Only a lower bound is asserted: at least the start cell is counted.
        self.assertGreaterEqual(found.nodes_explored, 1)
162+
163+
164+
class TestMaze(unittest.TestCase):
    """
    Test 5: Maze with a single forced route.

    Grid (5x5):
      col:  0  1  2  3  4
    row 0: [0, 0, 1, 0, 0]
    row 1: [1, 0, 1, 0, 1]
    row 2: [1, 0, 0, 0, 1]
    row 3: [0, 0, 1, 0, 0]
    row 4: [0, 1, 1, 1, 0]

    Start: (0,0)  End: (4,4)

    Hand-traced shortest path:
      (0,0) → right → (0,1)   [only move; down (1,0)=wall]
      (0,1) → down  → (1,1)   [right (0,2)=wall]
      (1,1) → down  → (2,1)   [left (1,0)=wall, right (1,2)=wall]
      (2,1) → right → (2,2)   [left (2,0)=wall; down (3,1) leads to dead-end pocket]
      (2,2) → right → (2,3)
      (2,3) → down  → (3,3)   [right (2,4)=wall; up (1,3)→(0,3)→(0,4) is dead end]
      (3,3) → right → (3,4)   [down (4,3)=wall, left (3,2)=wall]
      (3,4) → down  → (4,4)   ✓

    Path: (0,0),(0,1),(1,1),(2,1),(2,2),(2,3),(3,3),(3,4),(4,4)
    Length: 9 nodes

    Each test searches a fresh per-test copy of MAZE, so a search that marks
    cells in place cannot leak state from one test into the next.
    """

    MAZE = [
        [0, 0, 1, 0, 0],
        [1, 0, 1, 0, 1],
        [1, 0, 0, 0, 1],
        [0, 0, 1, 0, 0],
        [0, 1, 1, 1, 0],
    ]
    START = (0, 0)
    GOAL = (4, 4)

    def setUp(self):
        # Row-by-row copy: the shared class-level MAZE itself is never handed
        # to find_path.
        self.grid = [row[:] for row in self.MAZE]

    def _solve(self):
        """Search this test's private copy of the maze from START to GOAL."""
        return find_path(self.grid, self.START, self.GOAL)

    def test_maze_path_found(self):
        result = self._solve()
        self.assertIsNotNone(result)

    def test_maze_endpoints(self):
        result = self._solve()
        self.assertIsNotNone(result)
        self.assertEqual(result.path[0], (0, 0))
        self.assertEqual(result.path[-1], (4, 4))

    def test_maze_path_is_valid(self):
        result = self._solve()
        self.assertIsNotNone(result)
        # Validate against the pristine MAZE, not the copy the search received.
        self.assertTrue(is_valid_path(self.MAZE, result.path))

    def test_maze_shortest_path_length(self):
        # As hand-traced in the class docstring, the shortest path visits exactly 9 nodes.
        result = self._solve()
        self.assertIsNotNone(result)
        self.assertEqual(len(result.path), 9)

    def test_maze_exact_route(self):
        # There is only one path through this maze — verify the exact sequence.
        expected = [
            (0, 0), (0, 1),
            (1, 1),
            (2, 1), (2, 2), (2, 3),
            (3, 3), (3, 4),
            (4, 4),
        ]
        result = self._solve()
        self.assertIsNotNone(result)
        self.assertEqual(result.path, expected)
233+
234+
235+
class TestPerformance(unittest.TestCase):
    """Test 6: 50x50 grids are solved in under 1 second with optimal length."""

    SIZE = 50            # side length of every grid in this class
    TIME_LIMIT_S = 1.0   # wall-clock budget for a single find_path call

    def _open_grid(self):
        """Return a fresh SIZE x SIZE grid with no walls (independent rows)."""
        return [[0] * self.SIZE for _ in range(self.SIZE)]

    def _timed_find_path(self, grid, start, end):
        """Run find_path once and return ``(result, elapsed_seconds)``."""
        start_time = time.perf_counter()
        result = find_path(grid, start, end)
        return result, time.perf_counter() - start_time

    def _assert_within_limit(self, elapsed):
        """Fail with the measured duration if the time budget was exceeded."""
        self.assertLess(
            elapsed,
            self.TIME_LIMIT_S,
            f"Took {elapsed:.3f}s, must be < {self.TIME_LIMIT_S:g}s",
        )

    def test_large_open_grid(self):
        size = self.SIZE
        grid = self._open_grid()

        result, elapsed = self._timed_find_path(grid, (0, 0), (size - 1, size - 1))

        self.assertIsNotNone(result)
        self.assertTrue(is_valid_path(grid, result.path))
        self._assert_within_limit(elapsed)

    def test_large_grid_with_obstacle_channel(self):
        # Wall runs down the middle column except for a gap in the bottom row,
        # forcing the path to travel the full height before crossing.
        size = self.SIZE
        mid = size // 2
        grid = self._open_grid()
        for r in range(size - 1):  # leave bottom row open
            grid[r][mid] = 1

        result, elapsed = self._timed_find_path(grid, (0, 0), (0, size - 1))

        self.assertIsNotNone(result)
        self.assertEqual(result.path[0], (0, 0))
        self.assertEqual(result.path[-1], (0, size - 1))
        self.assertTrue(is_valid_path(grid, result.path))
        # The only crossing is (size-1, mid): (size-1) down, (size-1) across
        # and (size-1) up, i.e. 3*(size-1) moves plus the start node.
        self.assertEqual(len(result.path), 3 * (size - 1) + 1)
        self._assert_within_limit(elapsed)

    def test_large_grid_path_length_optimal(self):
        # On a fully open grid the optimum is the Manhattan distance between
        # opposite corners, 2*(size-1) moves, plus the start node (99 for 50x50).
        size = self.SIZE
        grid = self._open_grid()
        result = find_path(grid, (0, 0), (size - 1, size - 1))
        self.assertIsNotNone(result)
        self.assertEqual(len(result.path), 2 * (size - 1) + 1)
274+
275+
276+
class TestNodesExplored(unittest.TestCase):
    """Test 7: nodes_explored is reasonable — > 0 and never above the cell count.

    NOTE(review): these bounds assume nodes_explored counts distinct cells the
    search visited — confirm against grid.find_path.
    """

    def test_nodes_explored_positive(self):
        grid = [[0, 0, 0], [0, 0, 0], [0, 0, 0]]
        result = find_path(grid, (0, 0), (2, 2))
        self.assertIsNotNone(result)
        self.assertGreater(result.nodes_explored, 0)

    def test_nodes_explored_below_grid_size(self):
        grid = [[0, 0, 0], [0, 0, 0], [0, 0, 0]]
        result = find_path(grid, (0, 0), (2, 2))
        self.assertIsNotNone(result)
        total_cells = 3 * 3
        # Non-strict on purpose: corner to corner on an open grid, a correct
        # search may touch every cell.
        self.assertLessEqual(result.nodes_explored, total_cells)

    def test_nodes_explored_unreachable_covers_component(self):
        # Col 1 is a full wall, so (0,2) cannot be reached from (0,0).
        grid = [
            [0, 1, 0],
            [0, 1, 0],
            [0, 1, 0],
        ]
        result = find_path(grid, (0, 0), (0, 2))
        self.assertIsNone(result)
        # find_path must return None (not raise); nodes_explored only available on success

    def test_nodes_explored_large_grid_heuristic(self):
        # Start and goal share row 0, so the goal is size-1 steps away while the
        # far corner is 2*(size-1) steps away. A search that stops once the goal
        # is reached has no need to visit cells farther than the goal, so the
        # count must stay strictly below the cell total whatever the tie-breaking.
        # Corner to corner would not give a strict bound: with a Manhattan
        # heuristic every cell of an open grid ties on f = g + h, so a correct
        # A* may expand all of them.
        size = 20
        grid = [[0] * size for _ in range(size)]
        result = find_path(grid, (0, 0), (0, size - 1))
        self.assertIsNotNone(result)
        total_cells = size * size
        self.assertGreater(result.nodes_explored, 0)
        self.assertLess(result.nodes_explored, total_cells)

    def test_nodes_explored_start_equals_end(self):
        grid = [[0, 0], [0, 0]]
        result = find_path(grid, (0, 0), (0, 0))
        self.assertIsNotNone(result)
        self.assertGreaterEqual(result.nodes_explored, 1)
318+
319+
320+
# Run the whole suite with unittest's CLI runner when executed as a script.
if __name__ == "__main__":
    unittest.main()

0 commit comments

Comments
 (0)