Document ensemble test generation workflow (#159) (#160)

that-github-user · unknown · claude · web-flow · commit ca6e68f6fc89 · 2026-03-29T23:31:24.000-07:00
* Add test collection script for ensemble test generation demo * Document two-phase ensemble workflow for test generation (#159) - Add "Recommended workflows" section to README with two-phase pattern: generate tests via ensemble first, then implement against converged tests - Include ensemble-generated test suite (test_pathfinding_generated.py) produced by 3/5 agents, all converging on correct maze path length (9) - Add test collection wrapper script for --collect-only validation Addresses #159 (Option A: documented workflow) Also references #31 (A* showcase evidence) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: unknown <that-github-user@github.com> Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/README.md b/README.md
@@ -124,6 +124,28 @@ The key insight: **parallel attempts cost more tokens but not more time.** All a
 - **Complex refactors** — many files, easy to miss something
 - **Unfamiliar codebases** — multiple attempts reduce the chance of going down the wrong path
 
+## Recommended workflows
+
+### Two-phase: generate tests, then implement
+
+A single agent can write a wrong test that becomes a false oracle. Use the ensemble to validate your test suite before using it to judge implementations.
+
+**Phase 1 — generate tests:**
+```bash
+thinktank run "write unit tests for grid.py pathfinding" -n 5 -t "bash run-collect-tests.sh"
+thinktank compare 1 2  # compare assertions across agents
+```
+
+If all agents assert the same expected values, the tests are likely correct. If they disagree on a specific assertion (e.g., 3 agents say path length 9, 1 says 13), investigate before proceeding.
+
+**Phase 2 — implement:**
+```bash
+thinktank apply           # apply the converged test suite
+thinktank run "implement A* pathfinding in grid.py" -n 5 -t "python -m pytest"
+```
+
+**Why this matters:** During development, a single agent wrote a test asserting a shortest path of 13 steps when the correct answer was 9. This wrong test caused 13+ ensemble runs to show 0% pass rate — every agent was right, but the oracle was wrong. Using ensemble test generation would have caught the bad assertion via convergence analysis before it became the ground truth.
+
 ## Commands
 
 ### `thinktank run [prompt]`
diff --git a/examples/astar-python/run-collect-tests.sh b/examples/astar-python/run-collect-tests.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+cd "$(dirname "$0")" && python -m pytest test_pathfinding_generated.py --collect-only
diff --git a/examples/astar-python/test_pathfinding_generated.py b/examples/astar-python/test_pathfinding_generated.py
@@ -0,0 +1,321 @@
+"""
+Comprehensive unit tests for A* pathfinding — generated test suite.
+
+Tests use unittest (not pytest fixtures).
+Import: find_path, Point from grid
+"""
+
+import time
+import unittest
+from grid import find_path, Point
+
+
+def is_valid_path(grid: list[list[int]], path: list[Point]) -> bool:
+    """Verify each step is adjacent (Manhattan distance 1) and on walkable terrain."""
+    for i, (r, c) in enumerate(path):
+        if r < 0 or r >= len(grid) or c < 0 or c >= len(grid[0]):
+            return False
+        if grid[r][c] != 0:
+            return False
+        if i > 0:
+            pr, pc = path[i - 1]
+            if abs(r - pr) + abs(c - pc) != 1:
+                return False
+    return True
+
+
+class TestStraightLine(unittest.TestCase):
+    """Test 1: Straight-line path with no obstacles."""
+
+    def test_horizontal_no_obstacles(self):
+        # Single row, walk right from col 0 to col 5 — path length must be 6
+        grid = [[0, 0, 0, 0, 0, 0]]
+        result = find_path(grid, (0, 0), (0, 5))
+        self.assertIsNotNone(result)
+        self.assertEqual(result.path[0], (0, 0))
+        self.assertEqual(result.path[-1], (0, 5))
+        self.assertEqual(len(result.path), 6)
+        self.assertTrue(is_valid_path(grid, result.path))
+
+    def test_vertical_no_obstacles(self):
+        # Single column, walk down from row 0 to row 4 — path length must be 5
+        grid = [[0]] * 5
+        result = find_path(grid, (0, 0), (4, 0))
+        self.assertIsNotNone(result)
+        self.assertEqual(result.path[0], (0, 0))
+        self.assertEqual(result.path[-1], (4, 0))
+        self.assertEqual(len(result.path), 5)
+        self.assertTrue(is_valid_path(grid, result.path))
+
+    def test_diagonal_corner_open_grid(self):
+        # Open 4x4 grid: Manhattan distance from (0,0) to (3,3) = 6, so path length = 7
+        grid = [
+            [0, 0, 0, 0],
+            [0, 0, 0, 0],
+            [0, 0, 0, 0],
+            [0, 0, 0, 0],
+        ]
+        result = find_path(grid, (0, 0), (3, 3))
+        self.assertIsNotNone(result)
+        self.assertEqual(len(result.path), 7)
+        self.assertTrue(is_valid_path(grid, result.path))
+
+
+class TestAroundObstacles(unittest.TestCase):
+    """Test 2: Path must detour around walls."""
+
+    def test_single_wall_column(self):
+        # Wall at col 2 forces path around via col 0 bottom or top edge.
+        # Grid 3 rows x 5 cols, wall runs down col 2 (rows 0–1 only).
+        # Start (0,0) → End (0,4):
+        # Must go (0,0)→(1,0)→(2,0)→(2,1)→(2,2)→(2,3)→(2,4)→(1,4)→(0,4) = 9 nodes
+        grid = [
+            [0, 0, 1, 0, 0],
+            [0, 0, 1, 0, 0],
+            [0, 0, 0, 0, 0],
+        ]
+        result = find_path(grid, (0, 0), (0, 4))
+        self.assertIsNotNone(result)
+        self.assertEqual(result.path[0], (0, 0))
+        self.assertEqual(result.path[-1], (0, 4))
+        self.assertEqual(len(result.path), 9)
+        self.assertTrue(is_valid_path(grid, result.path))
+
+    def test_u_shaped_obstacle(self):
+        # U-shaped wall forces path around the outside.
+        # Grid 4x4, wall forms a U opening upward.
+        # Start (0,0) → End (3,3)
+        grid = [
+            [0, 0, 0, 0],
+            [0, 1, 1, 0],
+            [0, 1, 1, 0],
+            [0, 0, 0, 0],
+        ]
+        result = find_path(grid, (0, 0), (3, 3))
+        self.assertIsNotNone(result)
+        self.assertTrue(is_valid_path(grid, result.path))
+        # Shortest path must skirt the 2x2 block: Manhattan cost = 6, path = 7 nodes
+        self.assertEqual(len(result.path), 7)
+
+
+class TestUnreachable(unittest.TestCase):
+    """Test 3: Goal is completely walled off."""
+
+    def test_target_enclosed_by_walls(self):
+        # Target (1,1) surrounded on all sides by walls.
+        grid = [
+            [0, 0, 0, 0, 0],
+            [0, 1, 1, 1, 0],
+            [0, 1, 0, 1, 0],
+            [0, 1, 1, 1, 0],
+            [0, 0, 0, 0, 0],
+        ]
+        result = find_path(grid, (0, 0), (2, 2))
+        self.assertIsNone(result)
+
+    def test_source_enclosed_by_walls(self):
+        # Start (2,2) is the enclosed cell; target is open.
+        grid = [
+            [0, 0, 0, 0, 0],
+            [0, 1, 1, 1, 0],
+            [0, 1, 0, 1, 0],
+            [0, 1, 1, 1, 0],
+            [0, 0, 0, 0, 0],
+        ]
+        result = find_path(grid, (2, 2), (0, 0))
+        self.assertIsNone(result)
+
+    def test_full_wall_column_divides_grid(self):
+        # Full column wall separates left from right half.
+        grid = [
+            [0, 1, 0],
+            [0, 1, 0],
+            [0, 1, 0],
+        ]
+        result = find_path(grid, (0, 0), (2, 2))
+        self.assertIsNone(result)
+
+
+class TestStartEqualsEnd(unittest.TestCase):
+    """Test 4: Start and end are the same cell."""
+
+    def test_same_cell_returns_single_node_path(self):
+        grid = [[0, 0, 0], [0, 0, 0], [0, 0, 0]]
+        result = find_path(grid, (1, 1), (1, 1))
+        self.assertIsNotNone(result)
+        self.assertEqual(len(result.path), 1)
+        self.assertEqual(result.path[0], (1, 1))
+
+    def test_same_cell_top_left_corner(self):
+        grid = [[0, 0], [0, 0]]
+        result = find_path(grid, (0, 0), (0, 0))
+        self.assertIsNotNone(result)
+        self.assertEqual(len(result.path), 1)
+        self.assertEqual(result.path[0], (0, 0))
+
+    def test_same_cell_nodes_explored_minimal(self):
+        grid = [[0, 0], [0, 0]]
+        result = find_path(grid, (0, 0), (0, 0))
+        self.assertIsNotNone(result)
+        # Only the start cell needs to be examined
+        self.assertGreaterEqual(result.nodes_explored, 1)
+
+
+class TestMaze(unittest.TestCase):
+    """
+    Test 5: Maze with a single forced route.
+
+    Grid (5x5):
+        col:  0  1  2  3  4
+      row 0: [0, 0, 1, 0, 0]
+      row 1: [1, 0, 1, 0, 1]
+      row 2: [1, 0, 0, 0, 1]
+      row 3: [0, 0, 1, 0, 0]
+      row 4: [0, 1, 1, 1, 0]
+
+    Start: (0,0)   End: (4,4)
+
+    Hand-traced shortest path:
+      (0,0) → right  → (0,1)   [only move; down (1,0)=wall]
+      (0,1) → down   → (1,1)   [right (0,2)=wall]
+      (1,1) → down   → (2,1)   [left (1,0)=wall, right (1,2)=wall]
+      (2,1) → right  → (2,2)   [left (2,0)=wall; down (3,1) leads to dead-end pocket]
+      (2,2) → right  → (2,3)
+      (2,3) → down   → (3,3)   [right (2,4)=wall; up (1,3)→(0,3)→(0,4) is dead end]
+      (3,3) → right  → (3,4)   [down (4,3)=wall, left (3,2)=wall]
+      (3,4) → down   → (4,4)   ✓
+
+    Path: (0,0),(0,1),(1,1),(2,1),(2,2),(2,3),(3,3),(3,4),(4,4)
+    Length: 9 nodes
+    """
+
+    MAZE = [
+        [0, 0, 1, 0, 0],
+        [1, 0, 1, 0, 1],
+        [1, 0, 0, 0, 1],
+        [0, 0, 1, 0, 0],
+        [0, 1, 1, 1, 0],
+    ]
+
+    def test_maze_path_found(self):
+        result = find_path(self.MAZE, (0, 0), (4, 4))
+        self.assertIsNotNone(result)
+
+    def test_maze_endpoints(self):
+        result = find_path(self.MAZE, (0, 0), (4, 4))
+        self.assertIsNotNone(result)
+        self.assertEqual(result.path[0], (0, 0))
+        self.assertEqual(result.path[-1], (4, 4))
+
+    def test_maze_path_is_valid(self):
+        result = find_path(self.MAZE, (0, 0), (4, 4))
+        self.assertIsNotNone(result)
+        self.assertTrue(is_valid_path(self.MAZE, result.path))
+
+    def test_maze_shortest_path_length(self):
+        # As hand-traced above, shortest path visits exactly 9 nodes.
+        result = find_path(self.MAZE, (0, 0), (4, 4))
+        self.assertIsNotNone(result)
+        self.assertEqual(len(result.path), 9)
+
+    def test_maze_exact_route(self):
+        # There is only one path through this maze — verify the exact sequence.
+        expected = [
+            (0, 0), (0, 1),
+            (1, 1),
+            (2, 1), (2, 2), (2, 3),
+            (3, 3), (3, 4),
+            (4, 4),
+        ]
+        result = find_path(self.MAZE, (0, 0), (4, 4))
+        self.assertIsNotNone(result)
+        self.assertEqual(result.path, expected)
+
+
+class TestPerformance(unittest.TestCase):
+    """Test 6: 50x50 grid completes in under 1 second."""
+
+    def test_large_open_grid(self):
+        size = 50
+        grid = [[0] * size for _ in range(size)]
+
+        start_time = time.perf_counter()
+        result = find_path(grid, (0, 0), (size - 1, size - 1))
+        elapsed = time.perf_counter() - start_time
+
+        self.assertIsNotNone(result)
+        self.assertTrue(is_valid_path(grid, result.path))
+        self.assertLess(elapsed, 1.0, f"Took {elapsed:.3f}s, must be < 1s")
+
+    def test_large_grid_with_obstacle_channel(self):
+        # Wall runs down the middle except for a gap at the bottom,
+        # forcing path to travel the full height before crossing.
+        size = 50
+        mid = size // 2
+        grid = [[0] * size for _ in range(size)]
+        for r in range(0, size - 1):      # leave bottom row open
+            grid[r][mid] = 1
+
+        start_time = time.perf_counter()
+        result = find_path(grid, (0, 0), (0, size - 1))
+        elapsed = time.perf_counter() - start_time
+
+        self.assertIsNotNone(result)
+        self.assertTrue(is_valid_path(grid, result.path))
+        self.assertLess(elapsed, 1.0, f"Took {elapsed:.3f}s, must be < 1s")
+
+    def test_large_grid_path_length_optimal(self):
+        # On a fully open 50x50 grid, Manhattan optimal is (49+49)+1 = 99 nodes.
+        size = 50
+        grid = [[0] * size for _ in range(size)]
+        result = find_path(grid, (0, 0), (size - 1, size - 1))
+        self.assertIsNotNone(result)
+        self.assertEqual(len(result.path), 99)
+
+
+class TestNodesExplored(unittest.TestCase):
+    """Test 7: nodes_explored is reasonable — > 0 and < total grid cells."""
+
+    def test_nodes_explored_positive(self):
+        grid = [[0, 0, 0], [0, 0, 0], [0, 0, 0]]
+        result = find_path(grid, (0, 0), (2, 2))
+        self.assertIsNotNone(result)
+        self.assertGreater(result.nodes_explored, 0)
+
+    def test_nodes_explored_below_grid_size(self):
+        grid = [[0, 0, 0], [0, 0, 0], [0, 0, 0]]
+        result = find_path(grid, (0, 0), (2, 2))
+        self.assertIsNotNone(result)
+        total_cells = 3 * 3
+        self.assertLessEqual(result.nodes_explored, total_cells)
+
+    def test_nodes_explored_unreachable_covers_component(self):
+        # All cells left of the wall are explored when target is unreachable.
+        grid = [
+            [0, 1, 0],
+            [0, 1, 0],
+            [0, 1, 0],
+        ]
+        result = find_path(grid, (0, 0), (0, 2))
+        self.assertIsNone(result)
+        # find_path must return None (not raise); nodes_explored only available on success
+
+    def test_nodes_explored_large_grid_heuristic(self):
+        # A* with a good heuristic should explore well under the full grid.
+        size = 20
+        grid = [[0] * size for _ in range(size)]
+        result = find_path(grid, (0, 0), (size - 1, size - 1))
+        self.assertIsNotNone(result)
+        total_cells = size * size
+        self.assertGreater(result.nodes_explored, 0)
+        self.assertLess(result.nodes_explored, total_cells)
+
+    def test_nodes_explored_start_equals_end(self):
+        grid = [[0, 0], [0, 0]]
+        result = find_path(grid, (0, 0), (0, 0))
+        self.assertIsNotNone(result)
+        self.assertGreaterEqual(result.nodes_explored, 1)
+
+
+if __name__ == "__main__":
+    unittest.main()

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+#!/bin/bash`
	`2`	`+cd "$(dirname "$0")" && python -m pytest test_pathfinding_generated.py --collect-only`