diff --git a/pdd/prompts/agentic_bug_orchestrator_python.prompt b/pdd/prompts/agentic_bug_orchestrator_python.prompt index dc695db4a..c268e7d1d 100644 --- a/pdd/prompts/agentic_bug_orchestrator_python.prompt +++ b/pdd/prompts/agentic_bug_orchestrator_python.prompt @@ -4,41 +4,63 @@ Write the `pdd/agentic_bug_orchestrator.py` module. % Role & Scope -Orchestrator for the 10-step agentic bug investigation workflow. Runs each step as a separate agentic task, accumulates context between steps, and tracks overall progress and cost. +Orchestrator for the 12-step agentic bug investigation workflow. Runs each step as a separate agentic task, accumulates context between steps, and tracks overall progress and cost. % Requirements 1. Function: `run_agentic_bug_orchestrator(issue_url: str, issue_content: str, repo_owner: str, repo_name: str, issue_number: int, issue_author: str, issue_title: str, *, cwd: Path, verbose: bool = False, quiet: bool = False) -> Tuple[bool, str, float, str, List[str]]` 2. Return 5-tuple: (success, final_message, total_cost, model_used, changed_files) -3. Run 10 steps sequentially, each as a separate `run_agentic_task()` call +3. Run 12 steps sequentially, each as a separate `run_agentic_task()` call 4. Accumulate step outputs to pass as context to subsequent steps 5. Track total cost across all steps -6. For Step 7: Parse agent output for `FILES_CREATED: path1, path2` or `FILES_MODIFIED: path1, path2` lines to extract changed files (used for hard stop check and final summary) -7. For Step 9: Parse agent output for `E2E_FILES_CREATED: path1, path2` lines to extract E2E test files, extend changed_files list -8. Pass extracted files to Step 9 (E2E) and Step 10 (PR) via `files_to_stage` context variable for explicit git staging +6. For Step 9: Parse agent output for `FILES_CREATED: path1, path2` or `FILES_MODIFIED: path1, path2` lines to extract changed files (used for hard stop check and final summary) +7. For Step 11: Parse agent output for `E2E_FILES_CREATED: path1, path2` lines to extract E2E test files, extend changed_files list +8. Pass extracted files to Step 11 (E2E) and Step 12 (PR) via `files_to_stage` context variable for explicit git staging % Step Execution -For each step (1-10): +For each step (1-12): 1. Load the step prompt template via `load_prompt_template(f"agentic_bug_step{n}_{name}_LLM")` 2. Format template with: issue_url, repo_owner, repo_name, issue_number, issue_content, issue_author, step1_output, step2_output, etc. -3. Call `run_agentic_task(formatted_prompt, cwd, ..., timeout=STEP_TIMEOUTS.get(step_num))` - - Import `STEP_TIMEOUTS` from `agentic_common` (Issue #261) +3. Call `run_agentic_task(formatted_prompt, cwd, ..., timeout=BUG_STEP_TIMEOUTS.get(step_num))` + - Define `BUG_STEP_TIMEOUTS: Dict[int, float]` as a module-level constant with integer keys 1-12 - Pass step-specific timeout to ensure complex steps get adequate time 4. Store the output for use by subsequent steps 5. Accumulate cost: `total_cost += step_cost` +% Per-Step Timeout Mapping +Define `BUG_STEP_TIMEOUTS` as a module-level dict with integer keys matching the 12-step `steps_config`: +```python +BUG_STEP_TIMEOUTS: Dict[int, float] = { + 1: 240.0, # Duplicate Check + 2: 400.0, # Docs Check + 3: 400.0, # Triage + 4: 500.0, # API Research + 5: 600.0, # Reproduce (Complex) + 6: 600.0, # Root Cause (Complex) + 7: 600.0, # Prompt Classification (may auto-fix prompts) + 8: 340.0, # Test Plan + 9: 1000.0, # Generate Unit Test (Most Complex) + 10: 600.0, # Verify Unit Test + 11: 2000.0, # E2E Test (Complex - needs to discover env & run tests) + 12: 240.0, # Create PR +} +``` +Keys MUST be integers (not floats like 5.5) and MUST match the step numbers in `steps_config`. + % Step Sequence | Step | Template Name | Purpose | |------|---------------|---------| | 1 | agentic_bug_step1_duplicate_LLM | Search for duplicate issues | | 2 | agentic_bug_step2_docs_LLM | Check documentation for user error | | 3 | agentic_bug_step3_triage_LLM | Assess if enough info to proceed | -| 4 | agentic_bug_step4_reproduce_LLM | Attempt to reproduce the bug | -| 5 | agentic_bug_step5_root_cause_LLM | Analyze root cause | -| 6 | agentic_bug_step6_test_plan_LLM | Design test strategy | -| 7 | agentic_bug_step7_generate_LLM | Generate failing unit test | -| 8 | agentic_bug_step8_verify_LLM | Verify test catches the bug | -| 9 | agentic_bug_step9_e2e_test_LLM | Generate and run E2E tests | -| 10 | agentic_bug_step10_pr_LLM | Create draft PR and link to issue | +| 4 | agentic_bug_step4_api_research_LLM | Research external API dependencies and constraints | +| 5 | agentic_bug_step5_reproduce_LLM | Attempt to reproduce the bug | +| 6 | agentic_bug_step6_root_cause_LLM | Analyze root cause | +| 7 | agentic_bug_step7_prompt_classification_LLM | Classify defect: code bug vs prompt defect | +| 8 | agentic_bug_step8_test_plan_LLM | Design test strategy | +| 9 | agentic_bug_step9_generate_LLM | Generate failing unit test | +| 10 | agentic_bug_step10_verify_LLM | Verify test catches the bug | +| 11 | agentic_bug_step11_e2e_test_LLM | Generate and run E2E tests | +| 12 | agentic_bug_step12_pr_LLM | Create draft PR and link to issue | % Context Accumulation - Step 1 receives: issue_content @@ -47,15 +69,17 @@ For each step (1-10): - Step 4 receives: issue_content, step1_output through step3_output - ... and so on - Step 8 receives: issue_content, step1_output through step7_output -- Step 9 receives: issue_content, step1_output through step8_output, worktree_path, files_to_stage -- Step 10 receives: issue_content, step1_output through step9_output, worktree_path, files_to_stage +- Step 9 receives: issue_content, step1_output through step8_output +- Step 10 receives: issue_content, step1_output through step9_output +- Step 11 receives: issue_content, step1_output through step10_output, worktree_path, files_to_stage +- Step 12 receives: issue_content, step1_output through step11_output, worktree_path, files_to_stage % Files to Stage -After Step 7 parses FILES_CREATED/FILES_MODIFIED, pass the extracted file list to Steps 9 and 10: +After Step 9 parses FILES_CREATED/FILES_MODIFIED, pass the extracted file list to Steps 11 and 12: - `context["files_to_stage"] = ", ".join(changed_files)` -- After Step 9 parses E2E_FILES_CREATED, extend changed_files and update files_to_stage -- This ensures Step 9 (E2E) and Step 10 (PR) know exactly which files to include -- Step 10 prompt uses `{files_to_stage}` to display the explicit list of files to stage +- After Step 11 parses E2E_FILES_CREATED, extend changed_files and update files_to_stage +- This ensures Step 11 (E2E) and Step 12 (PR) know exactly which files to include +- Step 12 prompt uses `{files_to_stage}` to display the explicit list of files to stage % Early Exit Conditions (Hard Stops) @@ -66,9 +90,10 @@ The orchestrator must parse step output to detect stop conditions: | 1 | Issue is a duplicate | Output contains "Duplicate of #" | | 2 | "Feature Request" or "User Error" | Output contains "Feature Request (Not a Bug)" or "User Error (Not a Bug)" | | 3 | Needs more info from author | Output contains "Needs More Info" | -| 7 | No test file generated | No FILES_CREATED or FILES_MODIFIED line in output, or empty file list | -| 8 | Test doesn't fail correctly | Output contains "FAIL: Test does not work as expected" | -| 9 | E2E test doesn't catch bug | Output contains "E2E_FAIL: Test does not catch bug correctly" | +| 7 | Prompt defect needs human review | Output contains "PROMPT_REVIEW:" | +| 9 | No test file generated | No FILES_CREATED or FILES_MODIFIED line in output, or empty file list | +| 10 | Test doesn't fail correctly | Output contains "FAIL: Test does not work as expected" | +| 11 | E2E test doesn't catch bug | Output contains "E2E_FAIL: Test does not catch bug correctly" | **Hard stop behavior:** - Return `(False, "Stopped at step N: {reason}", total_cost, model_used, changed_files)` @@ -82,7 +107,7 @@ The orchestrator must parse step output to detect stop conditions: % Git Worktree Isolation -Before Step 7, create an isolated git worktree for test generation and PR creation. This prevents the workflow from disturbing the user's current branch. +Before Step 7, create an isolated git worktree for prompt classification, test generation, and PR creation. This prevents the workflow from disturbing the user's current branch. 1. Helper function: `_setup_worktree(cwd: Path, issue_number: int, quiet: bool) -> Tuple[Optional[Path], Optional[str]]` - Create worktree at `.pdd/worktrees/fix-issue-{issue_number}/` relative to git root @@ -96,8 +121,8 @@ Before Step 7, create an isolated git worktree for test generation and PR creati 2. In orchestrator loop, before Step 7: - Call `_setup_worktree(cwd, issue_number, quiet)` - If worktree creation fails, return early with error: `(False, "Failed to create worktree: {error}", ...)` - - Switch working directory to worktree path for Steps 7-10 - - Add `worktree_path` to context dict (for Step 10 prompt formatting) + - Switch working directory to worktree path for Steps 7-12 + - Add `worktree_path` to context dict (for Step 12 prompt formatting) - Print: `"[blue]Working in worktree: {worktree_path}[/blue]"` (unless quiet) 3. Additional helper functions: @@ -118,7 +143,7 @@ The orchestrator must provide real-time feedback to the user via rich console ou 2. **Step progress** (before each step): ``` - [Step N/10] {step_description}... + [Step N/12] {step_description}... ``` 3. **Agentic output** (during each step): @@ -148,7 +173,7 @@ The orchestrator must provide real-time feedback to the user via rich console ou Use `quiet` flag to suppress all output except errors. Use `verbose` flag to show full agentic task output. % Dependencies -Import from `agentic_common`: `run_agentic_task`, `STEP_TIMEOUTS` +Import from `agentic_common`: `run_agentic_task` context/agentic_common_example.py context/load_prompt_template_example.py context/agentic_bug_example.py diff --git a/tests/test_agentic_bug_orchestrator.py b/tests/test_agentic_bug_orchestrator.py index 81af945c8..1408d8062 100644 --- a/tests/test_agentic_bug_orchestrator.py +++ b/tests/test_agentic_bug_orchestrator.py @@ -6363,3 +6363,245 @@ def mock_run_side_effect(*args, **kwargs): f"Expected warning about missing FIX_LOCATIONS marker. " f"Warning calls: {warning_calls}" ) + + +# ============================================================================= +# Issue #729: BUG_STEP_TIMEOUTS mismatch tests +# ============================================================================= +# These tests verify that BUG_STEP_TIMEOUTS correctly maps all 12 integer-keyed +# steps to the correct timeout values. They fail on the current buggy code because +# the dict uses the stale 5.5 numbering scheme (10 entries) instead of the correct +# 12-entry mapping (keys 1-12). + + +def test_issue729_all_12_steps_get_correct_timeout(mock_dependencies, default_args): + """Fresh run delivers correct per-step timeout for all 12 steps. + + The correct mapping is: + 1: 240, 2: 400, 3: 400, 4: 500, 5: 600, 6: 600, + 7: 600, 8: 340, 9: 1000, 10: 600, 11: 2000, 12: 240 + + Fails on buggy code: step 4 gets 600.0 instead of 500.0 (first mismatch). + """ + mock_run, _, _ = mock_dependencies + + # Expected correct timeout for each step (from the correct 12-step mapping) + correct_timeouts = { + 1: 240.0, 2: 400.0, 3: 400.0, 4: 500.0, 5: 600.0, 6: 600.0, + 7: 600.0, 8: 340.0, 9: 1000.0, 10: 600.0, 11: 2000.0, 12: 240.0, + } + + def side_effect(*args, **kwargs): + label = kwargs.get("label", "") + if label == "step9": + return (True, "gen\nFILES_CREATED: test.py", 0.1, "model") + return (True, "ok", 0.1, "model") + + mock_run.side_effect = side_effect + + run_agentic_bug_orchestrator(**default_args) + + # Build a map of step_num -> actual timeout from the mock calls + actual_timeouts = {} + for call_obj in mock_run.call_args_list: + label = call_obj.kwargs.get("label", "") + timeout = call_obj.kwargs.get("timeout") + step_str = label.replace("step", "").replace("_", ".") + step_num = float(step_str) if "." in step_str else int(step_str) + actual_timeouts[step_num] = timeout + + # Assert every step got the correct timeout + for step_num, expected_timeout in correct_timeouts.items(): + actual = actual_timeouts.get(step_num) + assert actual == expected_timeout, ( + f"Step {step_num}: expected timeout={expected_timeout}, got timeout={actual}" + ) + + +def test_issue729_resume_step6_gets_600s_timeout(mock_dependencies, default_args, tmp_path): + """Resume from step 5 should give step 6 (Root Cause) a 600s timeout. + + Fails on buggy code: step 6 gets 340.0 (Test Plan value from stale mapping). + """ + mock_run, _, _ = mock_dependencies + + # Set up saved state so orchestrator resumes from step 6 + state_dir = tmp_path / ".pdd" / "bug-state" + state_dir.mkdir(parents=True, exist_ok=True) + + import json + saved_state = { + "last_completed_step": 5, + "step_outputs": { + "1": "dup check ok", + "2": "docs ok", + "3": "triage ok", + "4": "api ok", + "5": "reproduce ok", + }, + "total_cost": 0.5, + "model_used": "model", + "worktree_path": str(tmp_path / ".pdd" / "worktrees" / "fix-issue-1"), + } + state_file = state_dir / "bug_state_1.json" + state_file.write_text(json.dumps(saved_state)) + + # Make load_workflow_state return the saved state + with patch("pdd.agentic_bug_orchestrator.load_workflow_state") as mock_load_state: + mock_load_state.return_value = (saved_state, None) + + def side_effect(*args, **kwargs): + label = kwargs.get("label", "") + if label == "step9": + return (True, "gen\nFILES_CREATED: test.py", 0.1, "model") + return (True, "ok", 0.1, "model") + + mock_run.side_effect = side_effect + + run_agentic_bug_orchestrator(**default_args) + + # Find the step 6 call and verify its timeout + step6_timeout = None + for call_obj in mock_run.call_args_list: + if call_obj.kwargs.get("label") == "step6": + step6_timeout = call_obj.kwargs.get("timeout") + break + + assert step6_timeout is not None, "step6 was never called" + assert step6_timeout == 600.0, ( + f"Step 6 (Root Cause) timeout: expected 600.0, got {step6_timeout}. " + "The stale BUG_STEP_TIMEOUTS mapping assigns 340.0 (Test Plan) to key 6." + ) + + +def test_issue729_fast_track_step6_gets_600s(mock_dependencies, default_args): + """Fast-track path (triage returns FAST_TRACK) should give step 6 600s timeout. + + When step 3 returns FAST_TRACK, steps 4-5 are skipped and step 6 runs next. + Fails on buggy code: step 6 gets 340.0 instead of 600.0. + """ + mock_run, _, _ = mock_dependencies + + call_count = [0] + + def side_effect(*args, **kwargs): + call_count[0] += 1 + label = kwargs.get("label", "") + if label == "step3": + return (True, "FAST_TRACK: Pre-diagnosed issue", 0.1, "model") + if label == "step9": + return (True, "gen\nFILES_CREATED: test.py", 0.1, "model") + return (True, "ok", 0.1, "model") + + mock_run.side_effect = side_effect + + run_agentic_bug_orchestrator(**default_args) + + # Verify steps 4 and 5 were skipped (not called via run_agentic_task) + called_labels = [c.kwargs.get("label") for c in mock_run.call_args_list] + assert "step4" not in called_labels, "step4 should be skipped in fast-track" + assert "step5" not in called_labels, "step5 should be skipped in fast-track" + + # Verify step 6 got the correct timeout + step6_timeout = None + for call_obj in mock_run.call_args_list: + if call_obj.kwargs.get("label") == "step6": + step6_timeout = call_obj.kwargs.get("timeout") + break + + assert step6_timeout is not None, "step6 was never called after fast-track" + assert step6_timeout == 600.0, ( + f"Step 6 (Root Cause) timeout after fast-track: expected 600.0, got {step6_timeout}. " + "Stale BUG_STEP_TIMEOUTS[6] returns 340.0 (Test Plan timeout)." + ) + + +def test_issue729_timeout_adder_uses_corrected_base(mock_dependencies, default_args): + """With timeout_adder=60, step 6 should get 660 (600+60), step 11 should get 2060. + + Fails on buggy code: step 6 gets 400.0 (340+60), step 11 gets 400.0 (default 340+60). + """ + mock_run, _, _ = mock_dependencies + + def side_effect(*args, **kwargs): + label = kwargs.get("label", "") + if label == "step9": + return (True, "gen\nFILES_CREATED: test.py", 0.1, "model") + return (True, "ok", 0.1, "model") + + mock_run.side_effect = side_effect + + default_args["timeout_adder"] = 60.0 + run_agentic_bug_orchestrator(**default_args) + + # Build timeout map from calls + timeout_map = {} + for call_obj in mock_run.call_args_list: + label = call_obj.kwargs.get("label", "") + timeout = call_obj.kwargs.get("timeout") + step_str = label.replace("step", "").replace("_", ".") + step_num = float(step_str) if "." in step_str else int(step_str) + timeout_map[step_num] = timeout + + # Step 6 (Root Cause): base 600 + adder 60 = 660 + assert timeout_map.get(6) == 660.0, ( + f"Step 6 with adder: expected 660.0 (600+60), got {timeout_map.get(6)}. " + "Buggy code gives 400.0 (340+60) from stale mapping." + ) + + # Step 11 (E2E): base 2000 + adder 60 = 2060 + assert timeout_map.get(11) == 2060.0, ( + f"Step 11 with adder: expected 2060.0 (2000+60), got {timeout_map.get(11)}. " + "Buggy code gives 400.0 (default 340+60) since key 11 is missing." + ) + + # Step 12 (Create PR): base 240 + adder 60 = 300 + assert timeout_map.get(12) == 300.0, ( + f"Step 12 with adder: expected 300.0 (240+60), got {timeout_map.get(12)}. " + "Buggy code gives 400.0 (default 340+60) since key 12 is missing." + ) + + +def test_issue729_no_step_falls_through_to_default_timeout(mock_dependencies, default_args): + """Steps 11 and 12 must have explicit timeouts, not fall through to 340.0 default. + + Fails on buggy code: steps 11-12 have no keys in stale dict, get default 340.0. + """ + mock_run, _, _ = mock_dependencies + + def side_effect(*args, **kwargs): + label = kwargs.get("label", "") + if label == "step9": + return (True, "gen\nFILES_CREATED: test.py", 0.1, "model") + return (True, "ok", 0.1, "model") + + mock_run.side_effect = side_effect + + run_agentic_bug_orchestrator(**default_args) + + # Collect timeouts for steps 10, 11, 12 — these are missing from the stale dict + timeout_map = {} + for call_obj in mock_run.call_args_list: + label = call_obj.kwargs.get("label", "") + timeout = call_obj.kwargs.get("timeout") + step_str = label.replace("step", "").replace("_", ".") + step_num = float(step_str) if "." in step_str else int(step_str) + timeout_map[step_num] = timeout + + # Step 11 (E2E) should get 2000, not 340 default + assert timeout_map.get(11) == 2000.0, ( + f"Step 11 (E2E): expected 2000.0, got {timeout_map.get(11)}. " + "Buggy code: key 11 missing from BUG_STEP_TIMEOUTS, falls through to 340.0 default." + ) + + # Step 12 (Create PR) should get 240, not 340 default + assert timeout_map.get(12) == 240.0, ( + f"Step 12 (Create PR): expected 240.0, got {timeout_map.get(12)}. " + "Buggy code: key 12 missing from BUG_STEP_TIMEOUTS, falls through to 340.0 default." + ) + + # Step 10 (Verify) should get 600, not the buggy mapping + assert timeout_map.get(10) == 600.0, ( + f"Step 10 (Verify Unit Test): expected 600.0, got {timeout_map.get(10)}. " + "Buggy code: key 10 maps to 240.0 (Create PR timeout from stale mapping)." + )