diff --git a/pdd/prompts/agentic_bug_orchestrator_python.prompt b/pdd/prompts/agentic_bug_orchestrator_python.prompt
index dc695db4a..c268e7d1d 100644
--- a/pdd/prompts/agentic_bug_orchestrator_python.prompt
+++ b/pdd/prompts/agentic_bug_orchestrator_python.prompt
@@ -4,41 +4,63 @@
Write the `pdd/agentic_bug_orchestrator.py` module.
% Role & Scope
-Orchestrator for the 10-step agentic bug investigation workflow. Runs each step as a separate agentic task, accumulates context between steps, and tracks overall progress and cost.
+Orchestrator for the 12-step agentic bug investigation workflow. Runs each step as a separate agentic task, accumulates context between steps, and tracks overall progress and cost.
% Requirements
1. Function: `run_agentic_bug_orchestrator(issue_url: str, issue_content: str, repo_owner: str, repo_name: str, issue_number: int, issue_author: str, issue_title: str, *, cwd: Path, verbose: bool = False, quiet: bool = False) -> Tuple[bool, str, float, str, List[str]]`
2. Return 5-tuple: (success, final_message, total_cost, model_used, changed_files)
-3. Run 10 steps sequentially, each as a separate `run_agentic_task()` call
+3. Run 12 steps sequentially, each as a separate `run_agentic_task()` call
4. Accumulate step outputs to pass as context to subsequent steps
5. Track total cost across all steps
-6. For Step 7: Parse agent output for `FILES_CREATED: path1, path2` or `FILES_MODIFIED: path1, path2` lines to extract changed files (used for hard stop check and final summary)
-7. For Step 9: Parse agent output for `E2E_FILES_CREATED: path1, path2` lines to extract E2E test files, extend changed_files list
-8. Pass extracted files to Step 9 (E2E) and Step 10 (PR) via `files_to_stage` context variable for explicit git staging
+6. For Step 9: Parse agent output for `FILES_CREATED: path1, path2` or `FILES_MODIFIED: path1, path2` lines to extract changed files (used for hard stop check and final summary)
+7. For Step 11: Parse agent output for `E2E_FILES_CREATED: path1, path2` lines to extract E2E test files, extend changed_files list
+8. Pass extracted files to Step 11 (E2E) and Step 12 (PR) via `files_to_stage` context variable for explicit git staging
% Step Execution
-For each step (1-10):
+For each step (1-12):
1. Load the step prompt template via `load_prompt_template(f"agentic_bug_step{n}_{name}_LLM")`
2. Format template with: issue_url, repo_owner, repo_name, issue_number, issue_content, issue_author, step1_output, step2_output, etc.
-3. Call `run_agentic_task(formatted_prompt, cwd, ..., timeout=STEP_TIMEOUTS.get(step_num))`
- - Import `STEP_TIMEOUTS` from `agentic_common` (Issue #261)
+3. Call `run_agentic_task(formatted_prompt, cwd, ..., timeout=BUG_STEP_TIMEOUTS.get(step_num))`
+ - Define `BUG_STEP_TIMEOUTS: Dict[int, float]` as a module-level constant with integer keys 1-12
- Pass step-specific timeout to ensure complex steps get adequate time
4. Store the output for use by subsequent steps
5. Accumulate cost: `total_cost += step_cost`
+% Per-Step Timeout Mapping
+Define `BUG_STEP_TIMEOUTS` as a module-level dict with integer keys matching the 12-step `steps_config`:
+```python
+BUG_STEP_TIMEOUTS: Dict[int, float] = {
+ 1: 240.0, # Duplicate Check
+ 2: 400.0, # Docs Check
+ 3: 400.0, # Triage
+ 4: 500.0, # API Research
+ 5: 600.0, # Reproduce (Complex)
+ 6: 600.0, # Root Cause (Complex)
+ 7: 600.0, # Prompt Classification (may auto-fix prompts)
+ 8: 340.0, # Test Plan
+ 9: 1000.0, # Generate Unit Test (Most Complex)
+ 10: 600.0, # Verify Unit Test
+ 11: 2000.0, # E2E Test (Complex - needs to discover env & run tests)
+ 12: 240.0, # Create PR
+}
+```
+Keys MUST be integers (not floats like 5.5) and MUST match the step numbers in `steps_config`.
+
% Step Sequence
| Step | Template Name | Purpose |
|------|---------------|---------|
| 1 | agentic_bug_step1_duplicate_LLM | Search for duplicate issues |
| 2 | agentic_bug_step2_docs_LLM | Check documentation for user error |
| 3 | agentic_bug_step3_triage_LLM | Assess if enough info to proceed |
-| 4 | agentic_bug_step4_reproduce_LLM | Attempt to reproduce the bug |
-| 5 | agentic_bug_step5_root_cause_LLM | Analyze root cause |
-| 6 | agentic_bug_step6_test_plan_LLM | Design test strategy |
-| 7 | agentic_bug_step7_generate_LLM | Generate failing unit test |
-| 8 | agentic_bug_step8_verify_LLM | Verify test catches the bug |
-| 9 | agentic_bug_step9_e2e_test_LLM | Generate and run E2E tests |
-| 10 | agentic_bug_step10_pr_LLM | Create draft PR and link to issue |
+| 4 | agentic_bug_step4_api_research_LLM | Research external API dependencies and constraints |
+| 5 | agentic_bug_step5_reproduce_LLM | Attempt to reproduce the bug |
+| 6 | agentic_bug_step6_root_cause_LLM | Analyze root cause |
+| 7 | agentic_bug_step7_prompt_classification_LLM | Classify defect: code bug vs prompt defect |
+| 8 | agentic_bug_step8_test_plan_LLM | Design test strategy |
+| 9 | agentic_bug_step9_generate_LLM | Generate failing unit test |
+| 10 | agentic_bug_step10_verify_LLM | Verify test catches the bug |
+| 11 | agentic_bug_step11_e2e_test_LLM | Generate and run E2E tests |
+| 12 | agentic_bug_step12_pr_LLM | Create draft PR and link to issue |
% Context Accumulation
- Step 1 receives: issue_content
@@ -47,15 +69,17 @@ For each step (1-10):
- Step 4 receives: issue_content, step1_output through step3_output
- ... and so on
- Step 8 receives: issue_content, step1_output through step7_output
-- Step 9 receives: issue_content, step1_output through step8_output, worktree_path, files_to_stage
-- Step 10 receives: issue_content, step1_output through step9_output, worktree_path, files_to_stage
+- Step 9 receives: issue_content, step1_output through step8_output
+- Step 10 receives: issue_content, step1_output through step9_output
+- Step 11 receives: issue_content, step1_output through step10_output, worktree_path, files_to_stage
+- Step 12 receives: issue_content, step1_output through step11_output, worktree_path, files_to_stage
% Files to Stage
-After Step 7 parses FILES_CREATED/FILES_MODIFIED, pass the extracted file list to Steps 9 and 10:
+After Step 9 parses FILES_CREATED/FILES_MODIFIED, pass the extracted file list to Steps 11 and 12:
- `context["files_to_stage"] = ", ".join(changed_files)`
-- After Step 9 parses E2E_FILES_CREATED, extend changed_files and update files_to_stage
-- This ensures Step 9 (E2E) and Step 10 (PR) know exactly which files to include
-- Step 10 prompt uses `{files_to_stage}` to display the explicit list of files to stage
+- After Step 11 parses E2E_FILES_CREATED, extend changed_files and update files_to_stage
+- This ensures Step 11 (E2E) and Step 12 (PR) know exactly which files to include
+- Step 12 prompt uses `{files_to_stage}` to display the explicit list of files to stage
% Early Exit Conditions (Hard Stops)
@@ -66,9 +90,10 @@ The orchestrator must parse step output to detect stop conditions:
| 1 | Issue is a duplicate | Output contains "Duplicate of #" |
| 2 | "Feature Request" or "User Error" | Output contains "Feature Request (Not a Bug)" or "User Error (Not a Bug)" |
| 3 | Needs more info from author | Output contains "Needs More Info" |
-| 7 | No test file generated | No FILES_CREATED or FILES_MODIFIED line in output, or empty file list |
-| 8 | Test doesn't fail correctly | Output contains "FAIL: Test does not work as expected" |
-| 9 | E2E test doesn't catch bug | Output contains "E2E_FAIL: Test does not catch bug correctly" |
+| 7 | Prompt defect needs human review | Output contains "PROMPT_REVIEW:" |
+| 9 | No test file generated | No FILES_CREATED or FILES_MODIFIED line in output, or empty file list |
+| 10 | Test doesn't fail correctly | Output contains "FAIL: Test does not work as expected" |
+| 11 | E2E test doesn't catch bug | Output contains "E2E_FAIL: Test does not catch bug correctly" |
**Hard stop behavior:**
- Return `(False, "Stopped at step N: {reason}", total_cost, model_used, changed_files)`
@@ -82,7 +107,7 @@ The orchestrator must parse step output to detect stop conditions:
% Git Worktree Isolation
-Before Step 7, create an isolated git worktree for test generation and PR creation. This prevents the workflow from disturbing the user's current branch.
+Before Step 7, create an isolated git worktree for prompt classification, test generation, and PR creation. This prevents the workflow from disturbing the user's current branch.
1. Helper function: `_setup_worktree(cwd: Path, issue_number: int, quiet: bool) -> Tuple[Optional[Path], Optional[str]]`
- Create worktree at `.pdd/worktrees/fix-issue-{issue_number}/` relative to git root
@@ -96,8 +121,8 @@ Before Step 7, create an isolated git worktree for test generation and PR creati
2. In orchestrator loop, before Step 7:
- Call `_setup_worktree(cwd, issue_number, quiet)`
- If worktree creation fails, return early with error: `(False, "Failed to create worktree: {error}", ...)`
- - Switch working directory to worktree path for Steps 7-10
- - Add `worktree_path` to context dict (for Step 10 prompt formatting)
+ - Switch working directory to worktree path for Steps 7-12
+ - Add `worktree_path` to context dict (for Step 12 prompt formatting)
- Print: `"[blue]Working in worktree: {worktree_path}[/blue]"` (unless quiet)
3. Additional helper functions:
@@ -118,7 +143,7 @@ The orchestrator must provide real-time feedback to the user via rich console ou
2. **Step progress** (before each step):
```
- [Step N/10] {step_description}...
+ [Step N/12] {step_description}...
```
3. **Agentic output** (during each step):
@@ -148,7 +173,7 @@ The orchestrator must provide real-time feedback to the user via rich console ou
Use `quiet` flag to suppress all output except errors. Use `verbose` flag to show full agentic task output.
% Dependencies
-Import from `agentic_common`: `run_agentic_task`, `STEP_TIMEOUTS`
+Import from `agentic_common`: `run_agentic_task`
context/agentic_common_example.py
context/load_prompt_template_example.py
context/agentic_bug_example.py
diff --git a/tests/test_agentic_bug_orchestrator.py b/tests/test_agentic_bug_orchestrator.py
index 81af945c8..1408d8062 100644
--- a/tests/test_agentic_bug_orchestrator.py
+++ b/tests/test_agentic_bug_orchestrator.py
@@ -6363,3 +6363,245 @@ def mock_run_side_effect(*args, **kwargs):
f"Expected warning about missing FIX_LOCATIONS marker. "
f"Warning calls: {warning_calls}"
)
+
+
+# =============================================================================
+# Issue #729: BUG_STEP_TIMEOUTS mismatch tests
+# =============================================================================
+# These tests verify that BUG_STEP_TIMEOUTS correctly maps all 12 integer-keyed
+# steps to the correct timeout values. They fail on the current buggy code because
+# the dict uses the stale 5.5 numbering scheme (10 entries) instead of the correct
+# 12-entry mapping (keys 1-12).
+
+
+def test_issue729_all_12_steps_get_correct_timeout(mock_dependencies, default_args):
+ """Fresh run delivers correct per-step timeout for all 12 steps.
+
+ The correct mapping is:
+ 1: 240, 2: 400, 3: 400, 4: 500, 5: 600, 6: 600,
+ 7: 600, 8: 340, 9: 1000, 10: 600, 11: 2000, 12: 240
+
+ Fails on buggy code: step 4 gets 600.0 instead of 500.0 (first mismatch).
+ """
+ mock_run, _, _ = mock_dependencies
+
+ # Expected correct timeout for each step (from the correct 12-step mapping)
+ correct_timeouts = {
+ 1: 240.0, 2: 400.0, 3: 400.0, 4: 500.0, 5: 600.0, 6: 600.0,
+ 7: 600.0, 8: 340.0, 9: 1000.0, 10: 600.0, 11: 2000.0, 12: 240.0,
+ }
+
+ def side_effect(*args, **kwargs):
+ label = kwargs.get("label", "")
+ if label == "step9":
+ return (True, "gen\nFILES_CREATED: test.py", 0.1, "model")
+ return (True, "ok", 0.1, "model")
+
+ mock_run.side_effect = side_effect
+
+ run_agentic_bug_orchestrator(**default_args)
+
+ # Build a map of step_num -> actual timeout from the mock calls
+ actual_timeouts = {}
+ for call_obj in mock_run.call_args_list:
+ label = call_obj.kwargs.get("label", "")
+ timeout = call_obj.kwargs.get("timeout")
+ step_str = label.replace("step", "").replace("_", ".")
+ step_num = float(step_str) if "." in step_str else int(step_str)
+ actual_timeouts[step_num] = timeout
+
+ # Assert every step got the correct timeout
+ for step_num, expected_timeout in correct_timeouts.items():
+ actual = actual_timeouts.get(step_num)
+ assert actual == expected_timeout, (
+ f"Step {step_num}: expected timeout={expected_timeout}, got timeout={actual}"
+ )
+
+
+def test_issue729_resume_step6_gets_600s_timeout(mock_dependencies, default_args, tmp_path):
+ """Resume from step 5 should give step 6 (Root Cause) a 600s timeout.
+
+ Fails on buggy code: step 6 gets 340.0 (Test Plan value from stale mapping).
+ """
+ mock_run, _, _ = mock_dependencies
+
+ # Set up saved state so orchestrator resumes from step 6
+ state_dir = tmp_path / ".pdd" / "bug-state"
+ state_dir.mkdir(parents=True, exist_ok=True)
+
+ import json
+ saved_state = {
+ "last_completed_step": 5,
+ "step_outputs": {
+ "1": "dup check ok",
+ "2": "docs ok",
+ "3": "triage ok",
+ "4": "api ok",
+ "5": "reproduce ok",
+ },
+ "total_cost": 0.5,
+ "model_used": "model",
+ "worktree_path": str(tmp_path / ".pdd" / "worktrees" / "fix-issue-1"),
+ }
+ state_file = state_dir / "bug_state_1.json"
+ state_file.write_text(json.dumps(saved_state))
+
+ # Make load_workflow_state return the saved state
+ with patch("pdd.agentic_bug_orchestrator.load_workflow_state") as mock_load_state:
+ mock_load_state.return_value = (saved_state, None)
+
+ def side_effect(*args, **kwargs):
+ label = kwargs.get("label", "")
+ if label == "step9":
+ return (True, "gen\nFILES_CREATED: test.py", 0.1, "model")
+ return (True, "ok", 0.1, "model")
+
+ mock_run.side_effect = side_effect
+
+ run_agentic_bug_orchestrator(**default_args)
+
+ # Find the step 6 call and verify its timeout
+ step6_timeout = None
+ for call_obj in mock_run.call_args_list:
+ if call_obj.kwargs.get("label") == "step6":
+ step6_timeout = call_obj.kwargs.get("timeout")
+ break
+
+ assert step6_timeout is not None, "step6 was never called"
+ assert step6_timeout == 600.0, (
+ f"Step 6 (Root Cause) timeout: expected 600.0, got {step6_timeout}. "
+ "The stale BUG_STEP_TIMEOUTS mapping assigns 340.0 (Test Plan) to key 6."
+ )
+
+
+def test_issue729_fast_track_step6_gets_600s(mock_dependencies, default_args):
+ """Fast-track path (triage returns FAST_TRACK) should give step 6 600s timeout.
+
+ When step 3 returns FAST_TRACK, steps 4-5 are skipped and step 6 runs next.
+ Fails on buggy code: step 6 gets 340.0 instead of 600.0.
+ """
+ mock_run, _, _ = mock_dependencies
+
+    def side_effect(*args, **kwargs):
+        # FAST_TRACK from step 3 makes the orchestrator skip steps 4-5;
+        # each run_agentic_task call carries a label identifying its step,
+        # so no call counter is needed here.
+        label = kwargs.get("label", "")
+ if label == "step3":
+ return (True, "FAST_TRACK: Pre-diagnosed issue", 0.1, "model")
+ if label == "step9":
+ return (True, "gen\nFILES_CREATED: test.py", 0.1, "model")
+ return (True, "ok", 0.1, "model")
+
+ mock_run.side_effect = side_effect
+
+ run_agentic_bug_orchestrator(**default_args)
+
+ # Verify steps 4 and 5 were skipped (not called via run_agentic_task)
+ called_labels = [c.kwargs.get("label") for c in mock_run.call_args_list]
+ assert "step4" not in called_labels, "step4 should be skipped in fast-track"
+ assert "step5" not in called_labels, "step5 should be skipped in fast-track"
+
+ # Verify step 6 got the correct timeout
+ step6_timeout = None
+ for call_obj in mock_run.call_args_list:
+ if call_obj.kwargs.get("label") == "step6":
+ step6_timeout = call_obj.kwargs.get("timeout")
+ break
+
+ assert step6_timeout is not None, "step6 was never called after fast-track"
+ assert step6_timeout == 600.0, (
+ f"Step 6 (Root Cause) timeout after fast-track: expected 600.0, got {step6_timeout}. "
+ "Stale BUG_STEP_TIMEOUTS[6] returns 340.0 (Test Plan timeout)."
+ )
+
+
+def test_issue729_timeout_adder_uses_corrected_base(mock_dependencies, default_args):
+ """With timeout_adder=60, step 6 should get 660 (600+60), step 11 should get 2060.
+
+ Fails on buggy code: step 6 gets 400.0 (340+60), step 11 gets 400.0 (default 340+60).
+ """
+ mock_run, _, _ = mock_dependencies
+
+ def side_effect(*args, **kwargs):
+ label = kwargs.get("label", "")
+ if label == "step9":
+ return (True, "gen\nFILES_CREATED: test.py", 0.1, "model")
+ return (True, "ok", 0.1, "model")
+
+ mock_run.side_effect = side_effect
+
+ default_args["timeout_adder"] = 60.0
+ run_agentic_bug_orchestrator(**default_args)
+
+ # Build timeout map from calls
+ timeout_map = {}
+ for call_obj in mock_run.call_args_list:
+ label = call_obj.kwargs.get("label", "")
+ timeout = call_obj.kwargs.get("timeout")
+ step_str = label.replace("step", "").replace("_", ".")
+ step_num = float(step_str) if "." in step_str else int(step_str)
+ timeout_map[step_num] = timeout
+
+ # Step 6 (Root Cause): base 600 + adder 60 = 660
+ assert timeout_map.get(6) == 660.0, (
+ f"Step 6 with adder: expected 660.0 (600+60), got {timeout_map.get(6)}. "
+ "Buggy code gives 400.0 (340+60) from stale mapping."
+ )
+
+ # Step 11 (E2E): base 2000 + adder 60 = 2060
+ assert timeout_map.get(11) == 2060.0, (
+ f"Step 11 with adder: expected 2060.0 (2000+60), got {timeout_map.get(11)}. "
+ "Buggy code gives 400.0 (default 340+60) since key 11 is missing."
+ )
+
+ # Step 12 (Create PR): base 240 + adder 60 = 300
+ assert timeout_map.get(12) == 300.0, (
+ f"Step 12 with adder: expected 300.0 (240+60), got {timeout_map.get(12)}. "
+ "Buggy code gives 400.0 (default 340+60) since key 12 is missing."
+ )
+
+
+def test_issue729_no_step_falls_through_to_default_timeout(mock_dependencies, default_args):
+ """Steps 11 and 12 must have explicit timeouts, not fall through to 340.0 default.
+
+ Fails on buggy code: steps 11-12 have no keys in stale dict, get default 340.0.
+ """
+ mock_run, _, _ = mock_dependencies
+
+ def side_effect(*args, **kwargs):
+ label = kwargs.get("label", "")
+ if label == "step9":
+ return (True, "gen\nFILES_CREATED: test.py", 0.1, "model")
+ return (True, "ok", 0.1, "model")
+
+ mock_run.side_effect = side_effect
+
+ run_agentic_bug_orchestrator(**default_args)
+
+ # Collect timeouts for steps 10, 11, 12 — these are missing from the stale dict
+ timeout_map = {}
+ for call_obj in mock_run.call_args_list:
+ label = call_obj.kwargs.get("label", "")
+ timeout = call_obj.kwargs.get("timeout")
+ step_str = label.replace("step", "").replace("_", ".")
+ step_num = float(step_str) if "." in step_str else int(step_str)
+ timeout_map[step_num] = timeout
+
+ # Step 11 (E2E) should get 2000, not 340 default
+ assert timeout_map.get(11) == 2000.0, (
+ f"Step 11 (E2E): expected 2000.0, got {timeout_map.get(11)}. "
+ "Buggy code: key 11 missing from BUG_STEP_TIMEOUTS, falls through to 340.0 default."
+ )
+
+ # Step 12 (Create PR) should get 240, not 340 default
+ assert timeout_map.get(12) == 240.0, (
+ f"Step 12 (Create PR): expected 240.0, got {timeout_map.get(12)}. "
+ "Buggy code: key 12 missing from BUG_STEP_TIMEOUTS, falls through to 340.0 default."
+ )
+
+ # Step 10 (Verify) should get 600, not the buggy mapping
+ assert timeout_map.get(10) == 600.0, (
+ f"Step 10 (Verify Unit Test): expected 600.0, got {timeout_map.get(10)}. "
+ "Buggy code: key 10 maps to 240.0 (Create PR timeout from stale mapping)."
+ )