diff --git a/architecture.json b/architecture.json index 7a8307dc4..5b0c826b9 100644 --- a/architecture.json +++ b/architecture.json @@ -80,6 +80,21 @@ "signature": "(text: Optional[str]) -> Optional[str]", "returns": "Optional[str]" }, + { + "name": "extract_step_report", + "signature": "(text: Optional[str]) -> Optional[str]", + "returns": "Optional[str]" + }, + { + "name": "normalize_step_comments_state", + "signature": "(raw: Any) -> Set[int]", + "returns": "Set[int]" + }, + { + "name": "post_step_comment_once", + "signature": "(*, repo_owner: str, repo_name: str, issue_number: int, step_num: int, body: str, posted_steps: Set[int], cwd: Path) -> bool", + "returns": "bool" + }, { "name": "_sanitize_comment_body", "signature": "(body: Optional[str], max_chars: int = 25_000) -> str", diff --git a/pdd/agentic_change_orchestrator.py b/pdd/agentic_change_orchestrator.py index b627f8205..620e02fba 100644 --- a/pdd/agentic_change_orchestrator.py +++ b/pdd/agentic_change_orchestrator.py @@ -29,6 +29,9 @@ post_step_comment, set_agentic_progress, clear_agentic_progress, + extract_step_report, + normalize_step_comments_state, + post_step_comment_once, ) from pdd.load_prompt_template import load_prompt_template from pdd.sync_order import ( @@ -1417,7 +1420,10 @@ def run_agentic_change_orchestrator( model_used = "unknown" github_comment_id = None worktree_path = None - + + step_comments_set: Set[int] = normalize_step_comments_state(state.get("step_comments")) + state["step_comments"] = sorted(step_comments_set) + pddrc_context = _load_pddrc_context(cwd) context = { @@ -1694,6 +1700,7 @@ def run_agentic_change_orchestrator( output=step_output, cwd=cwd, ) state["step_outputs"][str(step_num)] = f"FAILED: {step_output}" + state["step_comments"] = sorted(step_comments_set) save_workflow_state(cwd, issue_number, "change", state, state_dir, repo_owner, repo_name, use_github_state, github_comment_id) return False, f"Aborting: {consecutive_provider_failures} consecutive steps failed — agent providers unavailable", total_cost, model_used, [] else: @@ -1717,6 +1724,7 @@ def run_agentic_change_orchestrator( refreshed = _fetch_issue_updated_at(repo_owner, repo_name, issue_number) if refreshed: state["issue_updated_at"] = refreshed + state["step_comments"] = sorted(step_comments_set) save_workflow_state(cwd, issue_number, "change", state, state_dir, repo_owner, repo_name, use_github_state, github_comment_id) return False, f"Stopped at step {step_num}: {stop_reason}", total_cost, model_used, changed_files console.print(f"[yellow]Warning: Step {step_num} reported failure but continuing...[/yellow]") @@ -1739,6 +1747,7 @@ def run_agentic_change_orchestrator( refreshed = _fetch_issue_updated_at(repo_owner, repo_name, issue_number) if refreshed: state["issue_updated_at"] = refreshed + state["step_comments"] = sorted(step_comments_set) save_workflow_state(cwd, issue_number, "change", state, state_dir, repo_owner, repo_name, use_github_state, github_comment_id) return False, f"Stopped at step {step_num}: {stop_reason}", total_cost, model_used, changed_files @@ -1771,6 +1780,7 @@ def run_agentic_change_orchestrator( # Issue #467: Mark as FAILED instead of using step_num - 1 state["step_outputs"][str(step_num)] = f"FAILED: {step_output}" # Don't advance last_completed_step — keep it at its current value + state["step_comments"] = sorted(step_comments_set) save_workflow_state(cwd, issue_number, "change", state, state_dir, repo_owner, repo_name, use_github_state, github_comment_id) return False, "Stopped at step 9: Implementation produced no file changes", total_cost, model_used, [] @@ -1925,6 +1935,28 @@ def run_agentic_change_orchestrator( consecutive_provider_failures = 0 state["step_outputs"][str(step_num)] = step_output state["last_completed_step"] = step_num + try: + report_body = extract_step_report(step_output) + if not report_body: + report_body = ( + f"_Step {step_num} completed; no `` block " + "returned by agent. Raw output retained in workflow state._" + ) + comment_body = ( + f"## Step {step_num}/13: {description}\n\n{report_body}" + ) + post_step_comment_once( + repo_owner=repo_owner, + repo_name=repo_name, + issue_number=issue_number, + step_num=step_num, + body=comment_body, + posted_steps=step_comments_set, + cwd=current_work_dir, + ) + except Exception as exc: # pylint: disable=broad-except + console.print(f"[yellow]post_step_comment_once failed: {exc}[/yellow]") + state["step_comments"] = sorted(step_comments_set) else: state["step_outputs"][str(step_num)] = f"FAILED: {step_output}" @@ -1970,6 +2002,33 @@ def run_agentic_change_orchestrator( instruction=s11_prompt, cwd=current_work_dir, verbose=verbose, quiet=quiet, timeout=timeout11, label=f"step11_iter{review_iteration}", max_retries=DEFAULT_MAX_RETRIES, reasoning_time=reasoning_time, ) total_cost += s11_cost; model_used = s11_model; state["total_cost"] = total_cost + # Round-2 of Greg's review: gate the trusted Step 11 post on + # s11_success. A failed task (e.g. provider exhaustion) still + # returns step output, and posting a "completed" fallback would + # both mislead the user and mark this iteration's composite key + # (review_iteration*100 + 11) as already-posted in + # state["step_comments"]. On a later resume/retry the real + # successful Step 11 report would be silently deduped away. + if s11_success: + try: + s11_report = extract_step_report(s11_output) + if not s11_report: + s11_report = ( + "_Step 11 completed; no `` block returned " + "by agent. Raw output retained in workflow state._" + ) + post_step_comment_once( + repo_owner=repo_owner, + repo_name=repo_name, + issue_number=issue_number, + step_num=review_iteration * 100 + 11, + body=f"## Step 11/13: Review (iteration {review_iteration})\n\n{s11_report}", + posted_steps=step_comments_set, + cwd=current_work_dir, + ) + state["step_comments"] = sorted(step_comments_set) + except Exception as exc: # pylint: disable=broad-except + console.print(f"[yellow]post_step_comment_once failed: {exc}[/yellow]") if _review_loop_no_issues(s11_output): if not quiet: console.print(" -> No issues found. Proceeding to PR.") context["step11_output"] = s11_output; break @@ -1989,6 +2048,30 @@ def run_agentic_change_orchestrator( total_cost += s12_cost; model_used = s12_model; state["total_cost"] = total_cost previous_fixes += f"\n\nIteration {review_iteration}:\n{s12_output}" state["previous_fixes"] = previous_fixes + # Round-2 of Greg's review: gate the trusted Step 12 post on + # s12_success. Same reasoning as Step 11 above — a failed fix + # task must not burn the composite key, otherwise a later + # successful retry of the same iteration would be deduped. + if s12_success: + try: + s12_report = extract_step_report(s12_output) + if not s12_report: + s12_report = ( + "_Step 12 completed; no `` block returned " + "by agent. Raw output retained in workflow state._" + ) + post_step_comment_once( + repo_owner=repo_owner, + repo_name=repo_name, + issue_number=issue_number, + step_num=review_iteration * 100 + 12, + body=f"## Step 12/13: Fix (iteration {review_iteration})\n\n{s12_report}", + posted_steps=step_comments_set, + cwd=current_work_dir, + ) + state["step_comments"] = sorted(step_comments_set) + except Exception as exc: # pylint: disable=broad-except + console.print(f"[yellow]post_step_comment_once failed: {exc}[/yellow]") save_result = save_workflow_state(cwd, issue_number, "change", state, state_dir, repo_owner, repo_name, use_github_state, github_comment_id) if save_result: github_comment_id = save_result; state["github_comment_id"] = github_comment_id if review_iteration >= MAX_REVIEW_ITERATIONS: @@ -2097,8 +2180,28 @@ def run_agentic_change_orchestrator( if not s13_success: post_step_comment(repo_owner, repo_name, issue_number, 13, 13, "Create PR and link to issue", s13_output, cwd) console.print("[red]Step 13 (PR Creation) failed.[/red]") + state["step_comments"] = sorted(step_comments_set) save_workflow_state(cwd, issue_number, "change", state, state_dir, repo_owner, repo_name, use_github_state, github_comment_id) return False, "PR Creation failed", total_cost, model_used, changed_files + try: + s13_report = extract_step_report(s13_output) + if not s13_report: + s13_report = ( + "_Step 13 completed; no `` block returned " + "by agent. Raw output retained in workflow state._" + ) + post_step_comment_once( + repo_owner=repo_owner, + repo_name=repo_name, + issue_number=issue_number, + step_num=13, + body=f"## Step 13/13: Create PR and link to issue\n\n{s13_report}", + posted_steps=step_comments_set, + cwd=current_work_dir, + ) + state["step_comments"] = sorted(step_comments_set) + except Exception as exc: # pylint: disable=broad-except + console.print(f"[yellow]post_step_comment_once failed: {exc}[/yellow]") pr_url = "Unknown"; url_match = re.search(r"https://github.com/\S+/pull/\d+", s13_output) if url_match: pr_url = url_match.group(0) if not quiet: diff --git a/pdd/agentic_checkup_orchestrator.py b/pdd/agentic_checkup_orchestrator.py index 6d502c658..1d6a017c9 100644 --- a/pdd/agentic_checkup_orchestrator.py +++ b/pdd/agentic_checkup_orchestrator.py @@ -18,7 +18,7 @@ import shutil import subprocess from pathlib import Path -from typing import Dict, List, Optional, Tuple, Union +from typing import Dict, List, Optional, Set, Tuple, Union from rich.console import Console @@ -26,9 +26,12 @@ DEFAULT_MAX_RETRIES, _sanitize_comment_body, clear_workflow_state, + extract_step_report, load_workflow_state, + normalize_step_comments_state, post_pr_comment, post_step_comment, + post_step_comment_once, run_agentic_task, save_workflow_state, substitute_template_variables, @@ -1029,6 +1032,11 @@ def run_agentic_checkup_orchestrator( if changed_files: context["files_to_stage"] = ", ".join(changed_files) + if state is None: + step_comments_set: Set[int] = set() + else: + step_comments_set = normalize_step_comments_state(state.get("step_comments")) + # Step definitions (step 6 split into 6.1/6.2/6.3 sub-steps). steps: List[Tuple[Union[int, float], str, str]] = [ (1, "discover", "Discovering project structure and tech stack"), @@ -1069,6 +1077,7 @@ def _save_state() -> None: pr_owner=pr_owner, pr_repo=pr_repo, pr_head_sha=current_pr_head_sha if pr_mode else None, + step_comments=sorted(step_comments_set), ) github_comment_id = save_workflow_state( cwd=cwd, issue_number=issue_number, workflow_type="checkup", @@ -1078,6 +1087,43 @@ def _save_state() -> None: github_comment_id=github_comment_id, ) + def _step_comment_key(step_num: Union[int, float], iteration: int = 1) -> int: + """Project (step_num, iteration) -> deterministic non-negative int. + + Encoding ``iteration * 10000 + int(round(step_num * 10))`` handles + fractional steps (6.1 -> 61) and the iterated fix-verify loop. + """ + return iteration * 10000 + int(round(float(step_num) * 10)) + + def _maybe_post_step_comment( + step_num: Union[int, float], + description: str, + step_output: str, + iteration: int = 1, + ) -> None: + """Post a trusted per-step success comment; log-and-continue on error.""" + try: + report_body = extract_step_report(step_output) + if not report_body: + report_body = ( + f"_Step {step_num} completed; no `` block " + "returned by agent. Raw output retained in workflow state._" + ) + comment_body = ( + f"## Step {step_num}/{TOTAL_STEPS}: {description}\n\n{report_body}" + ) + post_step_comment_once( + repo_owner=repo_owner, + repo_name=repo_name, + issue_number=issue_number, + step_num=_step_comment_key(step_num, iteration), + body=comment_body, + posted_steps=step_comments_set, + cwd=current_cwd, + ) + except Exception as exc: # pylint: disable=broad-except + console.print(f"[yellow]post_step_comment_once failed: {exc}[/yellow]") + def _is_provider_failure(output: str) -> bool: return "All agent providers failed" in output @@ -1087,6 +1133,8 @@ def _handle_step_result( output: str, cost: float, model: str, + description: str = "", + iteration: int = 1, ) -> Optional[Tuple[bool, str, float, str]]: """Process a step result — update context, save state. @@ -1123,6 +1171,8 @@ def _handle_step_result( step_outputs[step_key] = output last_completed_step_to_save = step_num consecutive_provider_failures = 0 + if description: + _maybe_post_step_comment(step_num, description, output, iteration) else: step_outputs[step_key] = f"FAILED: {output}" if _is_provider_failure(output): @@ -1352,7 +1402,7 @@ def _post_pr_mode_final_report(final_step7_output: str) -> str: success, output, cost, model = result - abort = _handle_step_result(step_num, success, output, cost, model) + abort = _handle_step_result(step_num, success, output, cost, model, description=description) if abort is not None: return abort @@ -1395,7 +1445,7 @@ def _post_pr_mode_final_report(final_step7_output: str) -> str: return (False, f"Missing prompt template: {template_name}", total_cost, last_model_used) success, output, cost, model = result - abort = _handle_step_result(step_num, success, output, cost, model) + abort = _handle_step_result(step_num, success, output, cost, model, description=description) if abort is not None: return abort if not success and _is_provider_failure(output): @@ -1444,7 +1494,7 @@ def _post_pr_mode_final_report(final_step7_output: str) -> str: success, output, cost, model = result nofix_step7_output = output - abort = _handle_step_result(7, success, output, cost, model) + abort = _handle_step_result(7, success, output, cost, model, description=desc7) if abort is not None: return abort else: @@ -1585,7 +1635,10 @@ def _post_pr_mode_final_report(final_step7_output: str) -> str: return (False, f"Missing prompt template: {tmpl_name}", total_cost, last_model_used) success, output, cost, model = result - abort = _handle_step_result(step_num, success, output, cost, model) + abort = _handle_step_result( + step_num, success, output, cost, model, + description=description, iteration=fix_verify_iteration, + ) if abort is not None: return abort @@ -1854,7 +1907,7 @@ def _post_pr_mode_final_report(final_step7_output: str) -> str: return (False, f"Missing prompt template: {template_name}", total_cost, last_model_used) success, output, cost, model = result - abort = _handle_step_result(8, success, output, cost, model) + abort = _handle_step_result(8, success, output, cost, model, description=desc8) if abort is not None: return abort @@ -1903,6 +1956,7 @@ def _build_state( pr_owner: Optional[str] = None, pr_repo: Optional[str] = None, pr_head_sha: Optional[str] = None, + step_comments: Optional[List[int]] = None, ) -> Dict: """Build a serialisable state dict for persistence. @@ -1938,4 +1992,5 @@ def _build_state( "pr_owner": pr_owner, "pr_repo": pr_repo, "pr_head_sha": pr_head_sha, + "step_comments": list(step_comments) if step_comments else [], } diff --git a/pdd/agentic_common.py b/pdd/agentic_common.py index 315553a01..673452cbc 100644 --- a/pdd/agentic_common.py +++ b/pdd/agentic_common.py @@ -14,7 +14,7 @@ import random from datetime import datetime from pathlib import Path -from typing import List, Optional, Tuple, Dict, Any, Union +from typing import Any, Dict, List, Optional, Set, Tuple, Union from dataclasses import dataclass from rich.console import Console @@ -3503,6 +3503,130 @@ def _extract_step_report(text: Optional[str]) -> Optional[str]: return matches[-1].strip() +# Public alias for orchestrator callers; same semantics as ``_extract_step_report``. +extract_step_report = _extract_step_report + + +def normalize_step_comments_state(raw: Any) -> Set[int]: + """Coerce a persisted ``state["step_comments"]`` value into ``Set[int]``. + + Accepts every shape that has ever been persisted: + + * ``None`` / missing key -> empty set. + * ``list`` / ``tuple`` of ints -> set of those ints. + * ``set`` / ``frozenset`` -> defensive copy of int members. + * Legacy bug-orchestrator dict, e.g. + ``{"1": {"posted": True}, "2": {"failed_posted": True}}`` + -> set of int step numbers whose ``posted`` *or* ``failed_posted`` flag + is truthy. Pending shapes (``fallback_pending`` / ``failed_pending``) + are skipped so the orchestrator retries them on resume. + * Malformed or unexpected inputs -> empty set (never raises). + + ``bool`` is rejected even though it's an ``int`` subclass — the legacy + dict shape uses booleans as flag values, not step numbers. ``float`` is + rejected too — orchestrators with fractional steps must project them + through a composite-key helper before storing. + """ + if raw is None: + return set() + if isinstance(raw, dict): + out: Set[int] = set() + for key, value in raw.items(): + try: + step = int(key) + except (TypeError, ValueError): + continue + if isinstance(value, dict): + if value.get("posted") is True or value.get("failed_posted") is True: + out.add(step) + elif value is True: + out.add(step) + return out + if isinstance(raw, (list, tuple, set, frozenset)): + out = set() + for item in raw: + if isinstance(item, bool): + continue + if isinstance(item, int): + out.add(item) + continue + if isinstance(item, str): + try: + out.add(int(item)) + except ValueError: + continue + return out + return set() + + +def post_step_comment_once( + *, + repo_owner: str, + repo_name: str, + issue_number: int, + step_num: int, + body: str, + posted_steps: Set[int], + cwd: Path, +) -> bool: + """Post a per-step success comment via ``gh issue comment``, exactly once. + + Idempotent: if ``step_num`` is already in ``posted_steps``, returns + ``True`` immediately without invoking ``gh``. On a successful new post + it mutates ``posted_steps`` in place (adds ``step_num``). + + Args: + repo_owner: GitHub owner / org slug. + repo_name: GitHub repository name. + issue_number: Target issue number. + step_num: Integer key that uniquely identifies this step within the + workflow. Orchestrators with fractional or iterated steps must + project their (step_num, iteration) tuples to a deterministic + int before calling — the helper is ``Set[int]``-typed. + body: Pre-built comment body. The helper applies its own redaction + and truncation pass before shelling out; callers don't need to + sanitize first. + posted_steps: In-memory ``Set[int]`` of step keys already posted. + Mutated in place on a successful new post. + cwd: Working directory for the ``gh`` subprocess. + + Returns: + ``True`` on success-or-already-posted, ``False`` on missing CLI or + ``gh`` non-zero exit. Never raises; transient failures are logged + via ``console.print`` and surfaced as ``False``. + """ + if step_num in posted_steps: + return True + if not _find_cli_binary("gh"): + return False + final_body = _sanitize_comment_body(body) + try: + result = subprocess.run( + [ + "gh", "issue", "comment", str(issue_number), + "--repo", f"{repo_owner}/{repo_name}", + "--body", final_body, + ], + cwd=cwd, + capture_output=True, + text=True, + ) + if result.returncode != 0: + console.print( + f"[yellow]Warning: Failed to post step comment for step {step_num}: " + f"{result.stderr}[/yellow]" + ) + return False + except Exception as exc: # pylint: disable=broad-except + console.print( + f"[yellow]Warning: Failed to post step comment for step {step_num}: " + f"{exc}[/yellow]" + ) + return False + posted_steps.add(step_num) + return True + + def _sanitize_comment_body( body: Optional[str], max_chars: int = _COMMENT_MAX_CHARS, diff --git a/pdd/agentic_e2e_fix_orchestrator.py b/pdd/agentic_e2e_fix_orchestrator.py index 39ba6a157..843344208 100644 --- a/pdd/agentic_e2e_fix_orchestrator.py +++ b/pdd/agentic_e2e_fix_orchestrator.py @@ -30,6 +30,9 @@ GITHUB_STATE_MARKER_START, GITHUB_STATE_MARKER_END, _revert_out_of_scope_changes, + extract_step_report, + normalize_step_comments_state, + post_step_comment_once, ) from .get_test_command import get_test_command_for_file from .load_prompt_template import load_prompt_template @@ -1960,6 +1963,73 @@ def _run_step11_code_cleanup( return total_cost, changed_files +def _build_post_loop_state( + *, + workflow_name: str, + issue_url: str, + issue_number: int, + current_cycle: int, + last_completed_step: int, + step_outputs: Dict[str, str], + dev_unit_states: Dict[str, Any], + skipped_steps: Dict[int, str], + total_cost: float, + model_used: str, + changed_files: List[str], + github_comment_id: Optional[int], + step_comments_set: Set[int], + initial_file_hashes: Optional[Dict[str, Optional[str]]], + initial_sha: Optional[str], +) -> Dict[str, Any]: + """Build a workflow-state dict from function-scope locals only. + + The orchestrator's main ``state_data`` dict is first assigned inside the + inner ``for step_num in range(1, 10)`` loop. The + SUCCESS_FALL_THROUGH / resume-terminal-success resume paths skip that + loop entirely (Step 9 already declared success on the prior run, so + control flows directly into the post-loop cleanup + CI sites). On + those paths, any reference to the inner-loop ``state_data`` raises + ``UnboundLocalError`` and the broad ``try/except`` around the + post-loop Step 10/11 trusted-comment saves swallows it as a + ``post_step_comment_once failed`` warning, leaving the composite key + unpersisted. A later resume would then re-post the same Step 10/11 + visible comment. + + This helper assembles the same shape as the inner-loop save using + only locals that are unconditionally initialized at function entry + (``step_outputs``, ``step_comments_set``, ``changed_files``, etc.) so + the post-loop saves work identically on the linear-execution path, + the SUCCESS_FALL_THROUGH resume path, and the + resume-terminal-success path. + + Snapshot fields (``initial_file_hashes``, ``initial_sha``) accept + ``None`` so callers can pass through the resume-time restored values + without normalizing first. + """ + return { + "workflow": workflow_name, + "issue_url": issue_url, + "issue_number": issue_number, + "current_cycle": current_cycle, + "last_completed_step": last_completed_step, + "step_outputs": dict(step_outputs), + "dev_unit_states": dict(dev_unit_states), + "skipped_steps": {str(k): v for k, v in skipped_steps.items()}, + "total_cost": total_cost, + "model_used": model_used, + "changed_files": list(changed_files) if changed_files else [], + "last_saved_at": datetime.now().isoformat(), + "github_comment_id": github_comment_id, + "step_comments": sorted(step_comments_set), + "initial_file_hashes": ( + dict(initial_file_hashes) + if isinstance(initial_file_hashes, dict) + else None + ), + "initial_sha": initial_sha if isinstance(initial_sha, str) else None, + } + + def run_agentic_e2e_fix_orchestrator( issue_url: str, issue_content: str, @@ -2002,6 +2072,7 @@ def run_agentic_e2e_fix_orchestrator( dev_unit_states: Dict[str, Any] = {} skipped_steps: Dict[int, str] = {} github_comment_id: Optional[int] = None + step_comments_set: Set[int] = set() resumed_from_state = False # On resume, restore the workflow-start file snapshot so guards that diff # against it (e.g. NOT_A_BUG direct-edit suppression) keep working. @@ -2058,6 +2129,10 @@ def run_agentic_e2e_fix_orchestrator( if isinstance(saved_sha, str) and saved_sha: resumed_initial_sha = saved_sha + step_comments_set = normalize_step_comments_state( + loaded_state.get("step_comments") + ) + # Issue #1034 (codex P2 follow-up): restore the current cycle's # start snapshot so the resume-time cycle-waste-breaker can prove # no in-cycle progress before authorizing terminal success. @@ -2258,6 +2333,7 @@ def _persist_resume_reverification_state() -> None: "skipped_steps": {str(k): v for k, v in skipped_steps.items()}, "last_saved_at": datetime.now().isoformat(), "github_comment_id": github_comment_id, + "step_comments": sorted(step_comments_set), "initial_file_hashes": dict(initial_file_hashes), "initial_sha": initial_sha, # Persist None when the resumed cycle's baseline is unverified @@ -2690,6 +2766,29 @@ def _persist_resume_reverification_state() -> None: if step_success: step_outputs[str(step_num)] = step_output last_completed_step = step_num + try: + _report_body = extract_step_report(step_output) + if not _report_body: + _report_body = ( + f"_Step {step_num} completed; no `` " + "block returned by agent. Raw output retained in " + "workflow state._" + ) + _step_desc = STEP_DESCRIPTIONS.get(step_num, "") + _comment_body = ( + f"## Step {step_num}/11: {_step_desc}\n\n{_report_body}" + ) + post_step_comment_once( + repo_owner=repo_owner, + repo_name=repo_name, + issue_number=issue_number, + step_num=current_cycle * 10000 + step_num, + body=_comment_body, + posted_steps=step_comments_set, + cwd=cwd, + ) + except Exception as _exc: # pylint: disable=broad-except + console.print(f"[yellow]post_step_comment_once failed: {_exc}[/yellow]") else: step_outputs[str(step_num)] = f"FAILED: {step_output}" # Don't update last_completed_step - keep it at previous value @@ -2752,6 +2851,7 @@ def _persist_resume_reverification_state() -> None: "skipped_steps": {str(k): v for k, v in skipped_steps.items()}, "last_saved_at": datetime.now().isoformat(), "github_comment_id": github_comment_id, + "step_comments": sorted(step_comments_set), # Workflow-start snapshot — restored on resume so direct-edit # detection survives interruption. "initial_file_hashes": dict(initial_file_hashes), @@ -3086,6 +3186,7 @@ def _merge_step9_apply(r: _Step9TokenApplyResult) -> None: # Step 11: Code Cleanup (runs BEFORE CI so cleanup is CI-validated) if not skip_cleanup: + _pre_cleanup_files = set(changed_files or []) total_cost, changed_files = _run_step11_code_cleanup( cwd=cwd, issue_number=issue_number, @@ -3102,6 +3203,76 @@ def _merge_step9_apply(r: _Step9TokenApplyResult) -> None: quiet=quiet, reasoning_time=reasoning_time, ) + # Round-2 of Greg's review: extend trusted step-comment + # coverage to the post-loop Step 11 (cleanup) path. The + # helper does its own console output but did not post a + # visible per-step comment through trusted credentials. + # The encoding matches the in-loop scheme: + # `current_cycle * 10000 + 11` so resume/dedup remains + # consistent across cycles. + try: + _post_cleanup_files = set(changed_files or []) + _cleanup_changed = sorted(_post_cleanup_files - _pre_cleanup_files) + if _cleanup_changed: + _step11_summary = ( + "_Step 11 cleanup pass applied changes to: " + f"{', '.join(_cleanup_changed)}._" + ) + else: + _step11_summary = ( + "_Step 11 cleanup pass completed; no additional " + "files were modified (cleanup found nothing to do " + "or reverted itself when verification failed)._" + ) + _step11_desc = STEP_DESCRIPTIONS.get(11, "Code cleanup") + post_step_comment_once( + repo_owner=repo_owner, + repo_name=repo_name, + issue_number=issue_number, + step_num=current_cycle * 10000 + 11, + body=f"## Step 11/11: {_step11_desc}\n\n{_step11_summary}", + posted_steps=step_comments_set, + cwd=cwd, + ) + # Round-3/4 of Greg's review (idempotency across resume): + # persist `step_comments` *immediately* after the post so + # the composite key (`current_cycle * 10000 + 11`) + # survives the failure-return paths below — specifically + # the `ci_success=False` branch which returns without + # touching workflow state. + # + # Round-4 fix: do NOT mutate the inner-loop `state_data` + # local — on the SUCCESS_FALL_THROUGH / resume-terminal- + # success paths the inner `for step_num in range(1, 10)` + # loop never runs, so `state_data` is never assigned in + # this invocation and the mutation would raise + # `UnboundLocalError` inside the broad try/except. Build + # a fresh post-loop state dict from function-scope + # locals so the save works on every entry path. + save_workflow_state( + cwd, issue_number, workflow_name, + _build_post_loop_state( + workflow_name=workflow_name, + issue_url=issue_url, + issue_number=issue_number, + current_cycle=current_cycle, + last_completed_step=last_completed_step, + step_outputs=step_outputs, + dev_unit_states=dev_unit_states, + skipped_steps=skipped_steps, + total_cost=total_cost, + model_used=model_used, + changed_files=changed_files, + github_comment_id=github_comment_id, + step_comments_set=step_comments_set, + initial_file_hashes=initial_file_hashes, + initial_sha=initial_sha, + ), + state_dir, repo_owner, repo_name, + use_github_state, github_comment_id, + ) + except Exception as _exc: # pylint: disable=broad-except + console.print(f"[yellow]post_step_comment_once failed (step 11): {_exc}[/yellow]") if skip_ci: if not quiet: @@ -3127,7 +3298,67 @@ def _merge_step9_apply(r: _Step9TokenApplyResult) -> None: ) total_cost += ci_cost changed_files = _detect_changed_files(cwd, initial_file_hashes) or changed_files + # Round-2 of Greg's review: extend trusted step-comment coverage + # past the linear `range(1, 10)` loop. Step 10 (CI validation) + # runs outside the loop and must also post a visible per-step + # comment via trusted PDD credentials, keyed by + # `current_cycle * 10000 + 10` so the composite-key scheme matches + # the in-loop encoding and dedup survives resume/retry. if ci_success: + try: + _step10_desc = STEP_DESCRIPTIONS.get(10, "CI validation") + _step10_body = ( + f"## Step 10/11: {_step10_desc}\n\n" + f"_Step 10 completed; CI validation finished: {ci_message}_" + ) + post_step_comment_once( + repo_owner=repo_owner, + repo_name=repo_name, + issue_number=issue_number, + step_num=current_cycle * 10000 + 10, + body=_step10_body, + posted_steps=step_comments_set, + cwd=cwd, + ) + # Round-3/4 of Greg's review (idempotency across resume): + # persist `step_comments` *immediately* after the post so + # the composite key (`current_cycle * 10000 + 10`) + # survives the failure-return path below — specifically + # when `_run_final_checkup_on_pr` returns + # `checkup_success=False` and the orchestrator returns + # without clearing or saving state. + # + # Round-4 fix: do NOT mutate the inner-loop `state_data` + # local — on the SUCCESS_FALL_THROUGH / resume-terminal- + # success paths the inner step loop never runs, so + # `state_data` is never assigned and the mutation would + # raise UnboundLocalError. Build a fresh post-loop state + # dict from function-scope locals so the save works on + # every entry path. + save_workflow_state( + cwd, issue_number, workflow_name, + _build_post_loop_state( + workflow_name=workflow_name, + issue_url=issue_url, + issue_number=issue_number, + current_cycle=current_cycle, + last_completed_step=last_completed_step, + step_outputs=step_outputs, + dev_unit_states=dev_unit_states, + skipped_steps=skipped_steps, + total_cost=total_cost, + model_used=model_used, + changed_files=changed_files, + github_comment_id=github_comment_id, + step_comments_set=step_comments_set, + initial_file_hashes=initial_file_hashes, + initial_sha=initial_sha, + ), + state_dir, repo_owner, repo_name, + use_github_state, github_comment_id, + ) + except Exception as _exc: # pylint: disable=broad-except + console.print(f"[yellow]post_step_comment_once failed (step 10): {_exc}[/yellow]") if ci_message not in { "No open PR found for current branch; skipping CI validation", "No CI checks detected", @@ -3212,6 +3443,7 @@ def _merge_step9_apply(r: _Step9TokenApplyResult) -> None: "changed_files": changed_files, "last_saved_at": datetime.now().isoformat(), "github_comment_id": github_comment_id, + "step_comments": sorted(step_comments_set), # Issue #1034 codex P2 follow-up: persist workflow-start and # cycle-start snapshots so resume after an interrupt still has # the data needed for direct-edit detection and the cycle-waste- @@ -3263,6 +3495,7 @@ def _merge_step9_apply(r: _Step9TokenApplyResult) -> None: "changed_files": changed_files, "last_saved_at": datetime.now().isoformat(), "github_comment_id": github_comment_id, + "step_comments": sorted(step_comments_set), # Issue #1034 codex P2 follow-up: persist workflow-start and # cycle-start snapshots so resume after a fatal exception # still has the data needed for direct-edit detection and diff --git a/pdd/agentic_test_orchestrator.py b/pdd/agentic_test_orchestrator.py index 53b8d0a86..d922e18ec 100644 --- a/pdd/agentic_test_orchestrator.py +++ b/pdd/agentic_test_orchestrator.py @@ -13,7 +13,7 @@ import subprocess import time from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Set, Tuple, Union from .agentic_common import ( run_agentic_task, @@ -24,6 +24,9 @@ DEFAULT_MAX_RETRIES, set_agentic_progress, clear_agentic_progress, + extract_step_report, + normalize_step_comments_state, + post_step_comment_once, ) from .pytest_output import _find_project_root from .load_prompt_template import load_prompt_template @@ -344,6 +347,9 @@ def run_agentic_test_orchestrator( github_comment_id = None worktree_path = None + step_comments_set: Set[int] = normalize_step_comments_state(state.get("step_comments")) + state["step_comments"] = sorted(step_comments_set) + context: Dict[str, Any] = { "issue_url": issue_url, "issue_content": issue_content, @@ -466,13 +472,56 @@ def run_step( ) return step_success, step_output, step_cost, step_model + def _step_comment_key(step_num: Union[int, float], iteration: int = 1) -> int: + """Project (step_num, iteration) -> deterministic non-negative int. + + Encoding ``iteration * 10000 + int(round(step_num * 10))`` handles + fractional steps (5.5 -> 55) and iterated loop bodies. Result is + unique for iteration in [1, 999] and step_num in [0, 999.9]. + """ + return iteration * 10000 + int(round(float(step_num) * 10)) + + def _maybe_post_step_comment( + step_num: Union[int, float], + description: str, + step_output: str, + iteration: int = 1, + ) -> None: + """Post a trusted per-step success comment; log-and-continue on error.""" + try: + report_body = extract_step_report(step_output) + if not report_body: + report_body = ( + f"_Step {step_num} completed; no `` block " + "returned by agent. Raw output retained in workflow state._" + ) + comment_body = ( + f"## Step {step_num}/18: {description}\n\n{report_body}" + ) + post_step_comment_once( + repo_owner=repo_owner, + repo_name=repo_name, + issue_number=issue_number, + step_num=_step_comment_key(step_num, iteration), + body=comment_body, + posted_steps=step_comments_set, + cwd=current_work_dir, + ) + state["step_comments"] = sorted(step_comments_set) + except Exception as exc: # pylint: disable=broad-except + console.print(f"[yellow]post_step_comment_once failed: {exc}[/yellow]") + def save_state(step_num: Union[int, float], step_output: str, success: bool, - *, completed_step_override: Optional[Union[int, float]] = None) -> None: + *, completed_step_override: Optional[Union[int, float]] = None, + description: Optional[str] = None, + iteration: int = 1) -> None: nonlocal github_comment_id context[f"step{step_num}_output"] = step_output if success: state["step_outputs"][str(step_num)] = step_output state["last_completed_step"] = completed_step_override if completed_step_override is not None else step_num + if description is not None: + _maybe_post_step_comment(step_num, description, step_output, iteration) else: state["step_outputs"][str(step_num)] = f"FAILED: {step_output}" state["total_cost"] = total_cost @@ -540,7 +589,7 @@ def finish_hard_stop(step_num: Union[int, float], reason: str) -> Tuple[bool, st context["enhanced_test_plan"] = step_output context["step5b_output"] = step_output - save_state(step_num, step_output, step_success) + save_state(step_num, step_output, step_success, description=description) if stop_reason: return finish_hard_stop(step_num, stop_reason) @@ -578,7 +627,8 @@ def finish_hard_stop(step_num: Union[int, float], reason: str) -> Tuple[bool, st total_cost += step_cost model_used = step_model context[f"step{step_num}_output"] = step_output - save_state(step_num, step_output, step_success) + save_state(step_num, step_output, step_success, + description="Assess automated test coverage") coverage_gaps = _extract_int_tag(step_output, "COVERAGE_GAPS") if coverage_gaps == 0: run_manual = False @@ -602,7 +652,8 @@ def finish_hard_stop(step_num: Union[int, float], reason: str) -> Tuple[bool, st total_cost += step_cost model_used = step_model context[f"step{step_num}_output"] = step_output - save_state(step_num, step_output, step_success) + save_state(step_num, step_output, step_success, + description="Create manual testing checklist") if not step_success and not quiet: console.print(f"[yellow]Warning: Step {step_num} reported failure but continuing...[/yellow]") if not quiet: @@ -647,7 +698,9 @@ def finish_hard_stop(step_num: Union[int, float], reason: str) -> Tuple[bool, st total_cost += step_cost model_used = step_model context[f"step{step_num}_output"] = step_output - save_state(step_num, step_output, step_success) + save_state(step_num, step_output, step_success, + description=f"Manual browser testing (iteration {iteration})", + iteration=iteration) issues_found = _extract_int_tag(context.get("step8_output", ""), "ISSUES_FOUND") @@ -668,7 +721,9 @@ def finish_hard_stop(step_num: Union[int, float], reason: str) -> Tuple[bool, st if f not in changed_files: changed_files.append(f) context["files_to_stage"] = ", ".join(changed_files) - save_state(step_num, step_output, step_success) + save_state(step_num, step_output, step_success, + description=f"Create regression tests (iteration {iteration})", + iteration=iteration) # Step 10 step_num = 10 @@ -682,7 +737,9 @@ def finish_hard_stop(step_num: Union[int, float], reason: str) -> Tuple[bool, st total_cost += step_cost model_used = step_model context[f"step{step_num}_output"] = step_output - save_state(step_num, step_output, step_success) + save_state(step_num, step_output, step_success, + description=f"Validate regression tests (iteration {iteration})", + iteration=iteration) # Step 11 step_num = 11 @@ -696,7 +753,9 @@ def finish_hard_stop(step_num: Union[int, float], reason: str) -> Tuple[bool, st total_cost += step_cost model_used = step_model context[f"step{step_num}_output"] = step_output - save_state(step_num, step_output, step_success) + save_state(step_num, step_output, step_success, + description=f"Check checklist completion (iteration {iteration})", + iteration=iteration) checklist_status = _extract_tag(step_output, "CHECKLIST_STATUS") or "PARTIAL" if checklist_status.upper() == "COMPLETE": @@ -785,7 +844,7 @@ def finish_hard_stop(step_num: Union[int, float], reason: str) -> Tuple[bool, st if step_num == 13: context["test_results"] = step_output - save_state(step_num, step_output, step_success) + save_state(step_num, step_output, step_success, description=description) stop_reason = _check_hard_stop(step_num, step_output) if stop_reason: diff --git a/pdd/prompts/agentic_change_orchestrator_python.prompt b/pdd/prompts/agentic_change_orchestrator_python.prompt index 1584571c7..9a4b2c963 100644 --- a/pdd/prompts/agentic_change_orchestrator_python.prompt +++ b/pdd/prompts/agentic_change_orchestrator_python.prompt @@ -256,6 +256,28 @@ Track consecutive steps where output contains "All agent providers failed": - Increment counter on provider failure, reset to 0 on any success or non-provider failure - If 3+ consecutive provider failures: post step comment, save state, return `(False, "Aborting: N consecutive steps failed — agent providers unavailable", ...)` +% Per-Step Success Comments (Trusted Posting) + +The orchestrator owns user-visible per-step `## Step N/13:` comments on the GitHub issue, posted via `gh issue comment` with trusted PDD credentials. Step prompts MUST NOT call `gh issue comment` themselves for successful step progress; they emit a single `...` block in their final response, and the orchestrator extracts it and posts it once per step. + +1. **Helpers**: Import from `pdd.agentic_common`: + - `extract_step_report(step_output: str) -> Optional[str]` — returns the trimmed contents of the LAST `` block, or `None` if absent/malformed. Missing or malformed report MUST NOT fail the run; the raw step output is preserved in `state["step_outputs"]` regardless. + - `post_step_comment_once(*, repo_owner, repo_name, issue_number, step_num: int, body: str, posted_steps: Set[int], cwd) -> bool` — redacts secrets, truncates to GitHub's comment cap, posts via `gh issue comment`, and mutates `posted_steps` to add `step_num` on success. No-op when `step_num` is already in `posted_steps`. Returns `True` on success-or-already-posted, `False` on `gh` failure or missing CLI. Keyword-only signature. + - `normalize_step_comments_state(raw) -> Set[int]` — coerces persisted `step_comments` (list/set/None/legacy dict/malformed) into a `Set[int]` of integer step indices for in-memory use. + +2. **State shape**: `state["step_comments"]` persists as a sorted `List[int]` of step indices whose trusted success-comment has been posted. Immediately after `load_workflow_state`, hydrate to `Set[int]` via `normalize_step_comments_state(state.get("step_comments"))` and write `state["step_comments"] = sorted(set)` back so the on-disk form stays canonical. Persist back as a sorted list before every `save_workflow_state` call (normal completion AND hard-stop / failure early-returns). + +3. **Posting protocol** inside the per-step loop (applied to steps 1-10 with `step_num` as the integer key; steps 11/12 use the iteration-keyed encoding below; step 13 uses `step_num=13`): + - On a successful step, compute `report_body = extract_step_report(step_output)`. If `None`, post a short fallback body saying the step completed and raw output is retained in workflow state; do not fail the run. + - Otherwise build the comment body as `f"## Step {step_num}/13: {description}\n\n{report_body}"` and call `post_step_comment_once(repo_owner=..., repo_name=..., issue_number=..., step_num=step_num, body=body, posted_steps=step_comments_set, cwd=current_work_dir)`. + - After the call returns, persist the canonical sorted form: `state["step_comments"] = sorted(step_comments_set)` and rely on the subsequent `save_workflow_state` call to flush it. The set is mutated in place by the helper, so unconditional re-assignment after the call is safe and survives a contract refactor. + - Any exception from `extract_step_report` / `post_step_comment_once` / persistence MUST be log-and-continue (never raise into the orchestrator loop). Use `console.print` for the warning. + +4. **Review loop (steps 11-12)**: iteration-keyed integer encoding `iteration * 100 + step_num` (so iter=1, step=11 -> 111; iter=2, step=11 -> 211; iter=1, step=12 -> 112). This makes re-runs across review iterations get distinct `posted_steps` keys (otherwise the second iteration's report would silently dedupe against the first). + - **REQUIRED success gate**: the `post_step_comment_once` call for Step 11 and Step 12 MUST be guarded by the corresponding `s11_success` / `s12_success` boolean returned by `run_agentic_task`. A failed task (e.g. provider exhaustion) still returns step output, and posting a "Step N completed; no `` block returned" fallback comment on top of that failed output would both mislead the user AND mark the iteration-keyed composite (`iteration * 100 + step_num`) as posted in `state["step_comments"]`. On a later resume/retry, the real successful Step 11/12 report would then be silently deduped away. Treat the success flag as a hard precondition for the trusted-post block; do not relax it to "log-and-continue" — the gate is structural, not a transient error. + +5. **Relationship to the legacy `post_step_comment()` path**: The existing `post_step_comment()` call under `% Early Exit Conditions (Hard Stops)` and `% Consecutive Provider Failure Tracking` is UNCHANGED. Hard-stop failure comments and 3-consecutive-provider-failure abort comments continue to flow through the legacy helper. The new `post_step_comment_once` is strictly additive — it covers the success-comment case that previously had no first-party posting path and was being filled (incorrectly) by the model invoking `gh issue comment` itself. + % Architecture Sanitization (After Step 10) After Step 10 completes, sanitize architecture.json in the worktree: @@ -424,7 +446,7 @@ To avoid re-running completed steps when the workflow is paused (e.g., for clari - On state file read/write errors, log warning and continue without persistence % Dependencies -Import from `agentic_common`: `run_agentic_task`, `DEFAULT_MAX_RETRIES`, `load_workflow_state`, `save_workflow_state`, `clear_workflow_state`, `substitute_template_variables`, `validate_cached_state`, `post_step_comment`, `set_agentic_progress`, `clear_agentic_progress` +Import from `agentic_common`: `run_agentic_task`, `DEFAULT_MAX_RETRIES`, `load_workflow_state`, `save_workflow_state`, `clear_workflow_state`, `substitute_template_variables`, `validate_cached_state`, `post_step_comment`, `set_agentic_progress`, `clear_agentic_progress`, `extract_step_report`, `post_step_comment_once`, `normalize_step_comments_state` Import from `pdd.preprocess`: `preprocess` Import from `pdd.architecture_sync`: `_merge_interface_signatures`, `register_untracked_prompts` Note: `CHANGE_STEP_TIMEOUTS` is defined locally in this module (see Per-Step Timeouts section) diff --git a/pdd/prompts/agentic_checkup_orchestrator_python.prompt b/pdd/prompts/agentic_checkup_orchestrator_python.prompt index d67baf024..7c69ac8ab 100644 --- a/pdd/prompts/agentic_checkup_orchestrator_python.prompt +++ b/pdd/prompts/agentic_checkup_orchestrator_python.prompt @@ -18,6 +18,13 @@ - Step 7 must not post GitHub comments in PR mode. The orchestrator owns posting the canonical PR/issue final report and MUST post it on every PR-mode terminal path (gate pass after push, gate fail, max-iteration exhaustion, registry/prompt-source guard refusals, push failure, post-push reverify abort, `--no-fix` gate pass/fail). `_post_pr_mode_final_report` returns a `status_suffix` string — empty when both posts succeeded, otherwise containing a pointer to `.pdd/checkup-pr-/final-report.md`. Callers MUST append the suffix to whatever message they return so pdd-issue/pdd_cloud surfaces show the partial-post condition. The helper persists `step_outputs["pr_post_status"]` for state inspection. Comment-post failures MUST NOT flip the gate outcome — gh / network flakiness is independent of code-verification truth (round-5 Finding 4). - Skip Step 8 entirely (the PR already exists; the job is to verify and improve it, not open another). 8. Implement per-step timeouts and error handling, including a mechanism to abort if agent providers fail consecutively. + 9. **Trusted Step-Comment Flow**: The orchestrator owns visible per-step `## Step N/8:` comments on the GitHub issue, posted via `gh issue comment` with trusted PDD credentials. Step prompts MUST NOT call `gh issue comment` themselves for successful step progress — they emit a single `...` block in their final response and the orchestrator extracts and posts it once. + - Import from `pdd.agentic_common`: `extract_step_report(step_output) -> Optional[str]`, `post_step_comment_once(*, repo_owner, repo_name, issue_number, step_num: int, body: str, posted_steps: Set[int], cwd) -> bool`, and `normalize_step_comments_state(raw) -> Set[int]`. + - **State shape**: `state["step_comments"]` persists as a sorted `List[int]` on disk; hydrate to `Set[int]` via `normalize_step_comments_state` on resume; persist back as a sorted list via `_build_state(..., step_comments=sorted(step_comments_set))` on every save (success, hard-stop, abort). + - **Composite-key encoding (REQUIRED — helper is `Set[int]`-typed)**: the checkup workflow has fractional steps (6.1, 6.2, 6.3) and an iterated fix-verify loop. Project each (step_num, iteration) tuple to a deterministic non-negative int via `iteration * 10000 + int(round(step_num * 10))`. Use `iteration=1` for non-iterative steps (1, 2, 8); pass the actual `fix_verify_iteration` for the loop-body steps (3, 4, 5, 6.1, 6.2, 6.3, 7). + - **Posting protocol** inside `_handle_step_result`: when `description` is provided AND the step succeeded, compute `report_body = extract_step_report(step_output)`; if `None`, use a short fallback body saying the step completed and raw output is retained in workflow state. Build `f"## Step {step_num}/8: {description}\n\n{report_body}"` and call `post_step_comment_once(...)` with the composite key. Any exception MUST be log-and-continue (never raise into the orchestrator loop). + - **PR-mode**: in PR-verification mode the trusted helper still posts to the GitHub *issue* thread (`post_step_comment_once` calls `gh issue comment`). It does NOT replace the Step 7 PR-thread comment routing (`gh pr comment`). + - **Relationship to legacy `post_step_comment`**: hard-stop / failure comments continue to flow through the legacy helper unchanged. `post_step_comment_once` is strictly additive — it covers the success-comment case that previously had no first-party posting path. 3) Dependencies: diff --git a/pdd/prompts/agentic_common_python.prompt b/pdd/prompts/agentic_common_python.prompt index f228c08b2..db1226845 100644 --- a/pdd/prompts/agentic_common_python.prompt +++ b/pdd/prompts/agentic_common_python.prompt @@ -104,8 +104,14 @@ Shared infrastructure for agentic CLI invocations (Claude Code, Gemini, Codex, O - `get_and_clear_agentic_interrupt_context() -> Optional[Dict]`: Retrieve and clear progress for Ctrl+C reporting. 17. **Template Variable Substitution**: `substitute_template_variables(template, variables) -> str`: Safe substitution that handles curly braces in LLM output without raising KeyError. 18. **Post Final Comment**: `post_final_comment(repo_owner, repo_name, issue_number, reason, total_cost, steps_completed, total_steps, cwd) -> bool`: Post a generated workflow summary comment to the GitHub issue when the workflow stops early. The function builds the comment body from the stop reason, cumulative cost, and completed/total step counts; callers do not pass a preformatted body. -19. **OpenCode Model Resolution**: Resolve the OpenCode model in this order: (1) `OPENCODE_MODEL` env var, kept verbatim including nested slashes like `openrouter/openai/gpt-5.3-codex`; (2) derive a candidate from `llm_model.csv` using PDD's existing model-strength semantics, then translate LiteLLM-oriented IDs via `_translate_to_opencode_model()`. The CSV fallback MUST be auth-aware: build the configured OpenCode provider set from parsed provider credentials in `~/.local/share/opencode/auth.json`, parsed usable OpenCode config provider/model entries (`~/.config/opencode/opencode.json`, nearest project `opencode.json`, `OPENCODE_CONFIG`, `OPENCODE_CONFIG_CONTENT`), and every provider credential env var represented in `llm_model.csv`; filter candidate rows to providers that are configured before selecting a model. OpenCode config sources contribute a configured provider only when they declare a provider/model path with resolvable auth or explicit local/no-key provider semantics; bare config existence is diagnostic-only. OpenCode agentic runs use `OPENCODE_MODEL` or the auth-aware CSV fallback, not generic direct-prompt model defaults. Required translations include `github_copilot/X -> github-copilot/X`, `gemini/X -> google/X`, bare Anthropic rows like `claude-sonnet-... -> anthropic/claude-sonnet-...`, and bare OpenAI rows like `gpt-5 -> openai/gpt-5`; IDs already in OpenCode `provider/model` form pass through unchanged. If no configured provider can serve the selected model, fail fast with an actionable error telling the user to set `OPENCODE_MODEL=provider/model`, configure the matching provider, or run `opencode models` after authentication. Do not rely on OpenCode default model resolution. -20. **OpenCode Optional Knobs**: Honor `OPENCODE_AGENT` by passing `--agent ` and `OPENCODE_VARIANT` by passing `--variant ` when set. Omit both flags when unset. `PDD_OPENCODE_MODE` is out of scope for this module version; use `opencode run` only. +19. **Trusted Step-Comment Helpers**: Orchestrators (not LLM step prompts) own visible per-step `## Step N/T:` comments on the GitHub issue, posted via `gh issue comment` with trusted PDD credentials. Step prompts emit a `...` block in their final response; the orchestrator extracts it, sanitizes/truncates, and posts once. Three helpers form the contract: + - `extract_step_report(text: Optional[str]) -> Optional[str]`: returns the trimmed contents of the LAST `...` block in `text` (DOTALL, case-insensitive). Returns `None` for empty input or when the block is missing/malformed. Missing or malformed report MUST NOT fail the workflow — orchestrators still post a short fallback visible comment, and the raw step output is preserved in `state["step_outputs"]` regardless. A private `_extract_step_report` alias is exposed for internal callers, and a module-level `_STEP_REPORT_RE` is the compiled pattern. + - `post_step_comment_once(*, repo_owner, repo_name, issue_number, step_num: int, body: str, posted_steps: Set[int], cwd: Path) -> bool`: idempotent trusted post. If `step_num` is already in `posted_steps`, returns `True` immediately without invoking `gh`. Otherwise applies the same redaction + truncation pass used by `_sanitize_comment_body`, shells out to `gh issue comment --repo {owner}/{name} --body ` from `cwd`, and on success mutates `posted_steps` in place (adds `step_num`). Returns `True` on success-or-already-posted, `False` on missing `gh` CLI or non-zero exit. Never raises; transient failures are logged via `console.print` and surfaced as `False`. Keyword-only signature — orchestrators must call by keyword to avoid positional drift. + - `normalize_step_comments_state(raw: Any) -> Set[int]`: coerces the persisted `state["step_comments"]` value into an in-memory `Set[int]`. Accepts (a) `None` / missing key -> empty set; (b) list/tuple/set/frozenset of ints (or strings convertible to int) -> set; (c) the legacy bug-orchestrator dict shape `{"": {"posted": True}, ...}` -> set of int keys whose `posted` *or* `failed_posted` is `True` (so failure-comment posts also count as "already on GitHub"); (d) malformed / unexpected inputs -> empty set. Never raises. `bool` and `float` values are rejected as step numbers (bool is an int subclass, but always a flag in legacy state; float requires composite-key projection at the orchestrator before persistence). + - **Redaction/truncation pipeline** (used by `post_step_comment_once`): `_SECRET_PATTERNS` covers GitHub PATs (`ghp_`, `github_pat_`, `gho_`/`ghu_`/`ghs_`/`ghr_`), Google AI keys (`AIza`), xAI (`xai-`), OpenAI (`sk-`), Groq (`gsk_`). `_ENV_TOKEN_RE` redacts `GH_TOKEN=...` / `GITHUB_TOKEN=...`. `_BEARER_TOKEN_RE` redacts `Authorization: Bearer `. After redaction, bodies longer than `_COMMENT_MAX_CHARS = 25000` are truncated with a `\n\n…[truncated]` marker; the marker length is reserved so the final body never exceeds the cap. Idempotent and conservative — never raises. + - **State shape contract**: `state["step_comments"]` persists as a sorted `List[int]` on disk. Orchestrators rehydrate to `Set[int]` via `normalize_step_comments_state` immediately after `load_workflow_state` and write back `sorted(set)` before every `save_workflow_state` call (success path, hard-stop early return, provider-failure abort). For orchestrators with fractional or iterated steps (checkup, test) and orchestrators with cycle loops (e2e_fix), each (step_num, iteration) or (step_num, cycle) tuple is projected to a deterministic non-negative int key before being passed as `step_num=` to the helper — the projection formula is per-orchestrator (documented in their own prompts). +20. **OpenCode Model Resolution**: Resolve the OpenCode model in this order: (1) `OPENCODE_MODEL` env var, kept verbatim including nested slashes like `openrouter/openai/gpt-5.3-codex`; (2) derive a candidate from `llm_model.csv` using PDD's existing model-strength semantics, then translate LiteLLM-oriented IDs via `_translate_to_opencode_model()`. The CSV fallback MUST be auth-aware: build the configured OpenCode provider set from parsed provider credentials in `~/.local/share/opencode/auth.json`, parsed usable OpenCode config provider/model entries (`~/.config/opencode/opencode.json`, nearest project `opencode.json`, `OPENCODE_CONFIG`, `OPENCODE_CONFIG_CONTENT`), and every provider credential env var represented in `llm_model.csv`; filter candidate rows to providers that are configured before selecting a model. OpenCode config sources contribute a configured provider only when they declare a provider/model path with resolvable auth or explicit local/no-key provider semantics; bare config existence is diagnostic-only. OpenCode agentic runs use `OPENCODE_MODEL` or the auth-aware CSV fallback, not generic direct-prompt model defaults. Required translations include `github_copilot/X -> github-copilot/X`, `gemini/X -> google/X`, bare Anthropic rows like `claude-sonnet-... -> anthropic/claude-sonnet-...`, and bare OpenAI rows like `gpt-5 -> openai/gpt-5`; IDs already in OpenCode `provider/model` form pass through unchanged. If no configured provider can serve the selected model, fail fast with an actionable error telling the user to set `OPENCODE_MODEL=provider/model`, configure the matching provider, or run `opencode models` after authentication. Do not rely on OpenCode default model resolution. +21. **OpenCode Optional Knobs**: Honor `OPENCODE_AGENT` by passing `--agent ` and `OPENCODE_VARIANT` by passing `--variant ` when set. Omit both flags when unset. `PDD_OPENCODE_MODE` is out of scope for this module version; use `opencode run` only. % Function Signatures `get_agent_provider_preference() -> List[str]` @@ -126,8 +132,11 @@ Shared infrastructure for agentic CLI invocations (Claude Code, Gemini, Codex, O `get_and_clear_agentic_interrupt_context() -> Optional[Dict]` `get_job_deadline() -> Optional[float]` `post_step_comment(repo_owner, repo_name, issue_number, step_num, total_steps, description, output, cwd, body=None) -> bool` — Issue #964: when `body` is provided (orchestrator extracted a `` block from a successful run), strip any leading duplicate `## Step ` header from the model body, sanitize via `_sanitize_comment_body`, and wrap in the orchestrator's canonical header. When `body=None`, retain the legacy FAILED fallback template using `output` (used by hard-stop callers in `agentic_change_orchestrator`). -`_extract_step_report(text: Optional[str]) -> Optional[str]` — Extracts the LAST `...` block (DOTALL + case-insensitive). Returns `None` when text is empty or no tagged block is present; otherwise the body is whitespace-stripped. Used by orchestrators to gate orchestrator-owned posting AND to gate the resume-time backfill sweep (legacy pre-PR outputs and synthetic FAST_TRACK skipped-step outputs both lack the tag and must NOT be reposted). +`_extract_step_report(text: Optional[str]) -> Optional[str]` — Extracts the LAST `...` block (DOTALL + case-insensitive). Returns `None` when text is empty or no tagged block is present; otherwise the body is whitespace-stripped. Orchestrators treat `None` as a successful-step fallback case: post a short visible comment that raw output is retained in workflow state, then preserve the raw output in state. `_sanitize_comment_body(body: Optional[str], max_chars: int = 25_000) -> str` — Redacts known secret formats (GitHub tokens, Google API keys, OpenAI/xAI/Groq keys, `GH_TOKEN=` / `GITHUB_TOKEN=` env, Bearer headers) then truncates to `max_chars`. Truncation reserves room for the `…[truncated]` marker so the returned length never exceeds `max_chars`. Idempotent. +`extract_step_report(text: Optional[str]) -> Optional[str]` — Public alias for `_extract_step_report`; orchestrators import this name for clarity at the call site. +`post_step_comment_once(*, repo_owner: str, repo_name: str, issue_number: int, step_num: int, body: str, posted_steps: Set[int], cwd: Path) -> bool` — Idempotent trusted post via `gh issue comment`. Returns `True` immediately if `step_num` is already in `posted_steps`; otherwise sanitizes/truncates `body`, posts, and mutates `posted_steps` in place on success. Keyword-only signature. Never raises; logs and returns `False` on failure. +`normalize_step_comments_state(raw: Any) -> Set[int]` — Coerces a persisted `state["step_comments"]` value (list/tuple/set/frozenset/legacy-dict/None/garbage) into a `Set[int]` of step keys. Never raises. Rejects `bool` (legacy flag) and `float` (requires composite-key projection at the orchestrator). `post_pr_comment(repo_owner, repo_name, pr_number, body, cwd) -> bool` `post_final_comment(repo_owner, repo_name, issue_number, reason, total_cost, steps_completed, total_steps, cwd) -> bool` `validate_cached_state(last_completed_step, step_outputs, step_order=None, quiet=False) -> Union[int, float]` diff --git a/pdd/prompts/agentic_e2e_fix_orchestrator_python.prompt b/pdd/prompts/agentic_e2e_fix_orchestrator_python.prompt index 71f12085c..16532653f 100644 --- a/pdd/prompts/agentic_e2e_fix_orchestrator_python.prompt +++ b/pdd/prompts/agentic_e2e_fix_orchestrator_python.prompt @@ -130,6 +130,21 @@ On the first cycle, look for a sibling `bug_state_.json` under the state - Both `step_outputs["3"]` and `bug_step_outputs["3"]` entry vectors MUST be covered symmetrically: the same re-evaluation logic, demotion token, and guard semantics apply to either cache source, since `bug_step_outputs` can be pre-populated from a sibling `bug_state_.json` (see "Reuse pdd-bug Analysis" above). - Legacy-state baseline guard: when resuming with `last_completed_step > 0` and `current_cycle > 1` from a state file that has NO persisted `cycle_start_hashes` (a legacy state file from before Issue #1034 persistence, or a pre-snapshot-save interrupt), the orchestrator MUST set an internal `cycle_baseline_unverified = True` flag. The outer loop's cycle-entry capture (`cycle_start_hashes = _get_file_hashes(cwd)`) would otherwise produce a post-edit fresh snapshot, making `_detect_meaningful_changes(cwd, cycle_start_hashes)` falsely report "no in-cycle progress" and letting the INLINE cycle-waste-breaker terminal-success on a baseline it cannot verify (even though the resume-time helper correctly demoted via the missing-snapshot branch and Step 3 then reruns). The inline cycle-waste-breaker (Step 3 NOT_A_BUG handler in the inner loop) MUST refuse terminal-success when `cycle_baseline_unverified` is True — the cycle falls through to "NOT_A_BUG ignored" and Step 9 decides loop control normally. The flag is cleared at the next legitimate cycle rollover so subsequent cycles use their fresh, verifiable baseline. CRITICAL: while `cycle_baseline_unverified` is True, ALL state save sites (main inner-loop save, post-Step-9 retry save, KeyboardInterrupt handler, fatal-Exception handler) MUST persist `cycle_start_hashes` as `None` rather than the in-memory local. Saving the fresh post-edit local would let the very next resume restore it as a "valid" snapshot, clear the unverified flag (which is in-memory only), and immediately terminal-success on a baseline that is still unverified. Persist-as-None forces the next resume's missing-snapshot path to re-set the flag. +% Trusted Step-Comment Flow + +The orchestrator owns user-visible per-step `## Step N/11:` comments on the GitHub issue, posted via `gh issue comment` with trusted PDD credentials. Step prompts MUST NOT call `gh issue comment` themselves for successful step progress — they emit a `...` block in their final response and the orchestrator extracts and posts it once. + +- Import from `pdd.agentic_common`: `extract_step_report(step_output) -> Optional[str]`, `post_step_comment_once(*, repo_owner, repo_name, issue_number, step_num: int, body: str, posted_steps: Set[int], cwd) -> bool`, and `normalize_step_comments_state(raw) -> Set[int]`. +- **State shape**: persist `state["step_comments"]` as a sorted `List[int]` (alongside `step_outputs`, `last_completed_step`, etc.). On resume, hydrate via `normalize_step_comments_state(loaded_state.get("step_comments"))` into a `Set[int]`. Persist back as a sorted list at every state save (success-path inner save, KeyboardInterrupt save, fatal-Exception save). +- **Cycle-keyed encoding (REQUIRED — helper is `Set[int]`-typed)**: the e2e_fix workflow has 11 integer steps inside a cycle loop. Project each (step_num, current_cycle) tuple to a deterministic non-negative int via `current_cycle * 10000 + step_num`. `current_cycle` is normalized to >=1 before the inner step loop. This guarantees re-runs across cycles emit distinct comments rather than dedupe. +- **Posting protocol** inside the step loop (steps 1-9), after a successful `run_agentic_task` returns: compute `report_body = extract_step_report(step_output)`; if `None`, use a short fallback body saying the step completed and raw output is retained in workflow state. Build `f"## Step {step_num}/11: {STEP_DESCRIPTIONS[step_num]}\n\n{report_body}"` and call `post_step_comment_once(..., step_num=cycle*10000+step_num, body=body, posted_steps=step_comments_set, cwd=cwd)`. Any exception MUST be log-and-continue. +- **Post-loop Step 10 (CI validation)**: immediately after `run_ci_validation_loop(...)` returns `ci_success=True`, post a trusted comment via `post_step_comment_once(..., step_num=current_cycle*10000+10, body=f"## Step 10/11: {STEP_DESCRIPTIONS[10]}\n\n_Step 10 completed; CI validation finished: {ci_message}_", posted_steps=step_comments_set, cwd=cwd)`. The same cycle-keyed encoding applies so resume across cycles emits distinct comments. Skip the post when `ci_success` is False; the legacy `post_final_comment` failure path handles the user-visible failure message. Wrap in `try/except` and log-and-continue on helper failure. +- **Post-loop Step 11 (cleanup)**: snapshot `_pre_cleanup_files = set(changed_files or [])` BEFORE calling `_run_step11_code_cleanup`. After it returns, compute `_cleanup_changed = sorted(set(changed_files or []) - _pre_cleanup_files)` and build the body as `## Step 11/11: {STEP_DESCRIPTIONS[11]}\n\n` followed by either `_Step 11 cleanup pass applied changes to: {', '.join(_cleanup_changed)}._` when files were added or `_Step 11 cleanup pass completed; no additional files were modified (cleanup found nothing to do or reverted itself when verification failed)._` otherwise. Call `post_step_comment_once(..., step_num=current_cycle*10000+11, body=body, ...)`. Wrap in `try/except` and log-and-continue. Reasoning: Steps 10 and 11 are part of the claimed `## Step N/11:` coverage and MUST emit their trusted-credential comments through this helper — leaving them silent would mean the orchestrator's "/11:" header overstates real visibility coverage and the underlying contract that step prompts MUST NOT call `gh issue comment` themselves would have no first-party replacement for these two steps. +- **REQUIRED post-loop persistence (idempotency across resume/retry)**: each Step 10 / Step 11 trusted post MUST be IMMEDIATELY followed by a `save_workflow_state(...)` call that flushes the updated `step_comments` (`sorted(step_comments_set)`) to disk BEFORE any subsequent failure-return path runs. The downstream `run_ci_validation_loop` (`ci_success=False`) and `_run_final_checkup_on_pr` (`checkup_success=False`) paths both `return False, ...` without touching workflow state, so without an inline save the saved state will not contain the just-posted composite key. On resume the rehydrated `step_comments_set` then misses it and `post_step_comment_once` re-posts a duplicate. Wrap the persistence call in the same `try/except` block that guards the post so a save failure logs-and-continues rather than aborting the workflow. +- **Resume-safe state construction (REQUIRED)**: build the dict passed to `save_workflow_state` from function-scope locals ONLY (`step_outputs`, `step_comments_set`, `changed_files`, `total_cost`, `model_used`, `dev_unit_states`, `skipped_steps`, `github_comment_id`, `initial_file_hashes`, `initial_sha`, etc.) — NOT by mutating the inner-loop `state_data` local. On the SUCCESS_FALL_THROUGH / resume-terminal-success paths the inner `for step_num in range(1, 10)` loop never runs, so the inner-loop `state_data` is never assigned in this invocation. Mutating it would raise `UnboundLocalError` inside the broad try/except around the post, get swallowed as a "post_step_comment_once failed" warning, and leave the composite key unpersisted on disk — a later resume re-posts the same Step 10/11 visible comment, reopening the idempotency hole on exactly the resume paths this state save is meant to protect. Factor the dict build into a small module-level helper (e.g. `_build_post_loop_state(*, workflow_name, issue_url, issue_number, current_cycle, last_completed_step, step_outputs, dev_unit_states, skipped_steps, total_cost, model_used, changed_files, github_comment_id, step_comments_set, initial_file_hashes, initial_sha) -> Dict[str, Any]`) so the linear-execution path and both resume paths use the same shape. +- **REQUIRED success gate inside the loop**: the trusted `post_step_comment_once` call MUST live inside the `if step_success:` branch (the same branch that updates `step_outputs[str(step_num)] = step_output` and advances `last_completed_step`). A failed step that posts a fallback comment would burn the composite key `current_cycle*10000+step_num` and dedupe the real successful retry away. +- **Relationship to legacy `post_final_comment`**: workflow-level final messages (NOT_A_BUG, MAX_CYCLES_REACHED, missing loop-control tokens) continue to flow through `post_final_comment` unchanged. `post_step_comment_once` is strictly additive for the success-step case. + % Commit / Push (push_with_retry) After a successful inner loop, `_commit_and_push` stages only files whose hashes differ from `initial_file_hashes` (filtered through `_is_intermediate_file` to drop `*_fixed.*`, `*.bak`, `error_output*.txt`, `.pdd/**` except `e2e-fix-state/`, `step*_output.md`, `test_issue_*_reproduction.py`, `.gh-wrapper/**` executor wrapper artifacts, etc.), commits as `"fix: {issue_title}\n\nFixes #{issue_number}"`, and pushes via `push_with_retry`. The `.gh-wrapper/**` filter (Issue #1001) MUST be applied in both the hash-diff staging path and any fallback `_get_modified_and_untracked` staging path so the executor's `gh`/`git` shim binaries (e.g. `.gh-wrapper/gh`, `.gh-wrapper/git`) never leak into the fix commit; matching MUST be platform-neutral (normalize `\\` → `/`) and anchored on the `.gh-wrapper/` directory boundary so legitimate paths containing the substring `gh-wrapper` (e.g. `gh-wrapper-docs.md`, `tools/gh_wrapper.py`) are not over-filtered. `.gh-wrapper/` is already in the project `.gitignore` but staging code must not rely on that alone (`git add ` bypasses gitignore). diff --git a/pdd/prompts/agentic_test_orchestrator_python.prompt b/pdd/prompts/agentic_test_orchestrator_python.prompt index daed58bfa..ee5a5f6a8 100644 --- a/pdd/prompts/agentic_test_orchestrator_python.prompt +++ b/pdd/prompts/agentic_test_orchestrator_python.prompt @@ -25,6 +25,12 @@ The module is an **Agentic Test Orchestrator** for an 18-step automated test gen 6. **Parallel/Cloud Integration**: Support an optional cloud-based manual testing execution by polling for results in a JSON file if `PDD_CLOUD_RUN` is enabled. 7. **Progress Reporting**: Use `set_agentic_progress` to provide real-time updates on step completion and `rich` for console output. 8. **Resilience**: Implement timeouts per step (mapped in `TEST_STEP_TIMEOUTS`) and utilize a retry mechanism for agentic tasks. +9. **Trusted Step-Comment Flow**: The orchestrator owns visible per-step `## Step N/18:` comments on the GitHub issue, posted via `gh issue comment` with trusted PDD credentials. Step prompts MUST NOT call `gh issue comment` themselves for successful progress — they emit a `...` block in their final response and the orchestrator extracts and posts it once. + - Import from `pdd.agentic_common`: `extract_step_report(step_output) -> Optional[str]`, `post_step_comment_once(*, repo_owner, repo_name, issue_number, step_num: int, body: str, posted_steps: Set[int], cwd) -> bool`, and `normalize_step_comments_state(raw) -> Set[int]`. + - **State shape**: `state["step_comments"]` persists as a sorted `List[int]` on disk; hydrate to `Set[int]` via `normalize_step_comments_state` on resume; persist back as a sorted list before every `save_workflow_state` call (success path, hard-stop early-return, abort). + - **Composite-key encoding (REQUIRED — helper is `Set[int]`-typed)**: the test workflow has fractional steps (5.5) and an iterated manual-testing loop (steps 8/9/10/11 across `iteration in 1..3`). Project each (step_num, iteration) tuple to a deterministic non-negative int via `iteration * 10000 + int(round(step_num * 10))`. Use `iteration=1` for non-iterative steps; pass the actual loop iteration for the manual-testing-loop steps. This guarantees re-runs across iterations get distinct keys and resume-replay reproduces the same key. + - **Posting protocol** inside `save_state`: on success and when `description` is provided, compute `report_body = extract_step_report(step_output)`; if `None`, use a short fallback body saying the step completed and raw output is retained in workflow state. Build `f"## Step {step_num}/18: {description}\n\n{report_body}"` and call `post_step_comment_once(...)`. Persist `state["step_comments"] = sorted(step_comments_set)` so the on-disk shape stays canonical. Any exception from extract / post / persist MUST be log-and-continue. + - **Relationship to legacy `post_step_comment`**: hard-stop / failure comments continue to flow through the legacy helper unchanged. `post_step_comment_once` is strictly additive — it covers the success-comment case the model used to fill (incorrectly) by invoking `gh issue comment` itself. # Dependencies @@ -52,4 +58,4 @@ The module is an **Agentic Test Orchestrator** for an 18-step automated test gen 7. **Output Formatting**: Ensure all console output uses the `rich` library for consistent styling. # Deliverable -A Python module named `agentic_test_orchestrator.py` containing the primary entry point `run_agentic_test_orchestrator`. The function must return a tuple: `(success: bool, message: str, total_cost: float, model: str, changed_files: List[str])`. \ No newline at end of file +A Python module named `agentic_test_orchestrator.py` containing the primary entry point `run_agentic_test_orchestrator`. The function must return a tuple: `(success: bool, message: str, total_cost: float, model: str, changed_files: List[str])`. diff --git a/tests/test_agentic_change_orchestrator.py b/tests/test_agentic_change_orchestrator.py index 3d77ce6e9..79d23d803 100644 --- a/tests/test_agentic_change_orchestrator.py +++ b/tests/test_agentic_change_orchestrator.py @@ -5306,3 +5306,301 @@ def test_handles_filename_with_arrow_substring(self, tmp_path): assert expected in files, ( f"Filename containing ' -> ' was mis-parsed; got {files!r}" ) + + +# --------------------------------------------------------------------------- +# Trusted step-comment wiring +# --------------------------------------------------------------------------- + + +class TestTrustedStepCommentPosting: + """The change orchestrator must extract from each successful + step's output and post via post_step_comment_once with step_num for linear + steps and `iter * 100 + step_num` for review-loop steps 11/12.""" + + def test_success_path_posts_step_comment_once(self, mock_dependencies, temp_cwd): + mocks = mock_dependencies + mock_run = mocks["run"] + + def side_effect_run(**kwargs): + label = kwargs.get("label", "") + if label == "step9": + return ( + True, + "step 9\nFILES_MODIFIED: file_a.py", + 0.5, "gpt-4", + ) + if label == "step10": + return ( + True, + "step 10\nARCHITECTURE_FILES_MODIFIED: arch.json", + 0.1, "gpt-4", + ) + if label.startswith("step11"): + return (True, "step 11\nNo Issues Found", 0.1, "gpt-4") + if label == "step13": + return ( + True, + "step 13\nPR Created: https://github.com/o/r/pull/1", + 0.2, "gpt-4", + ) + return (True, f"step report {label}", 0.1, "gpt-4") + + mock_run.side_effect = side_effect_run + + with patch( + "pdd.agentic_change_orchestrator.post_step_comment_once", + return_value=True, + ) as mock_post_once: + success, _, _, _, _ = run_agentic_change_orchestrator( + issue_url="http://url", + issue_content="Fix bug", + repo_owner="owner", + repo_name="repo", + issue_number=1, + issue_author="me", + issue_title="Bug fix", + cwd=temp_cwd, + quiet=True, + ) + + assert success is True + assert mock_post_once.call_count >= 10 + for c in mock_post_once.call_args_list: + assert "step_num" in c.kwargs + assert isinstance(c.kwargs["step_num"], int) + assert "posted_steps" in c.kwargs + + def test_step_report_missing_posts_fallback_comment(self, mock_dependencies, temp_cwd): + mocks = mock_dependencies + mock_run = mocks["run"] + + def side_effect_run(**kwargs): + label = kwargs.get("label", "") + if label == "step9": + return (True, "FILES_MODIFIED: file_a.py", 0.5, "gpt-4") + if label == "step10": + return (True, "ARCHITECTURE_FILES_MODIFIED: arch.json", 0.1, "gpt-4") + if label.startswith("step11"): + return (True, "No Issues Found", 0.1, "gpt-4") + if label == "step13": + return (True, "PR Created: https://github.com/o/r/pull/1", 0.2, "gpt-4") + return (True, f"Output for {label}", 0.1, "gpt-4") + + mock_run.side_effect = side_effect_run + + with patch( + "pdd.agentic_change_orchestrator.post_step_comment_once", + return_value=True, + ) as mock_post_once: + success, _, _, _, _ = run_agentic_change_orchestrator( + issue_url="http://url", + issue_content="Fix bug", + repo_owner="owner", + repo_name="repo", + issue_number=1, + issue_author="me", + issue_title="Bug fix", + cwd=temp_cwd, + quiet=True, + ) + + assert success is True + assert mock_post_once.call_count >= 10 + assert any( + "no `` block returned by agent" in c.kwargs["body"] + and "Raw output retained in workflow state" in c.kwargs["body"] + for c in mock_post_once.call_args_list + ) + + def test_post_exception_does_not_break_run(self, mock_dependencies, temp_cwd): + mocks = mock_dependencies + mock_run = mocks["run"] + + def side_effect_run(**kwargs): + label = kwargs.get("label", "") + if label == "step9": + return ( + True, + "step 9\nFILES_MODIFIED: file_a.py", + 0.5, "gpt-4", + ) + if label == "step10": + return ( + True, + "step 10\nARCHITECTURE_FILES_MODIFIED: arch.json", + 0.1, "gpt-4", + ) + if label.startswith("step11"): + return (True, "step 11\nNo Issues Found", 0.1, "gpt-4") + if label == "step13": + return ( + True, + "step 13\nPR Created: https://github.com/o/r/pull/1", + 0.2, "gpt-4", + ) + return (True, f"step report {label}", 0.1, "gpt-4") + + mock_run.side_effect = side_effect_run + + with patch( + "pdd.agentic_change_orchestrator.post_step_comment_once", + side_effect=RuntimeError("simulated gh failure"), + ): + success, _, _, _, _ = run_agentic_change_orchestrator( + issue_url="http://url", + issue_content="Fix bug", + repo_owner="owner", + repo_name="repo", + issue_number=1, + issue_author="me", + issue_title="Bug fix", + cwd=temp_cwd, + quiet=True, + ) + + assert success is True + + def test_step11_failure_does_not_post_or_mark_iter_key( + self, mock_dependencies, temp_cwd + ): + """Regression for Greg's PR review (round 2). + + Step 11 (review) inside the iterated review-loop must NOT post a + trusted step-comment nor add its composite key to + ``state["step_comments"]`` when ``s11_success`` is False. Otherwise + a later successful retry of the same iteration's Step 11 would be + deduped against the failed run's fallback "Step 11 completed" key + and the user would see no real Step 11 report. + """ + mocks = mock_dependencies + mock_run = mocks["run"] + + def side_effect_run(**kwargs): + label = kwargs.get("label", "") + if label == "step9": + return ( + True, + "step 9\nFILES_MODIFIED: file_a.py", + 0.5, "gpt-4", + ) + if label == "step10": + return ( + True, + "step 10\nARCHITECTURE_FILES_MODIFIED: arch.json", + 0.1, "gpt-4", + ) + if label.startswith("step11"): + # Step 11 FAILS — provider exhausted, raw text still returned. + # The output deliberately includes "No Issues Found" so the + # legacy review-loop short-circuits and we don't need to mock + # downstream steps. Even with that sentinel, the trusted post + # must be skipped because s11_success is False. + return (False, "Provider failure; No Issues Found", 0.1, "gpt-4") + if label == "step13": + return ( + True, + "step 13\nPR Created: https://github.com/o/r/pull/1", + 0.2, "gpt-4", + ) + return (True, f"step report {label}", 0.1, "gpt-4") + + mock_run.side_effect = side_effect_run + + with patch( + "pdd.agentic_change_orchestrator.post_step_comment_once", + return_value=True, + ) as mock_post_once: + run_agentic_change_orchestrator( + issue_url="http://url", + issue_content="Fix bug", + repo_owner="owner", + repo_name="repo", + issue_number=1, + issue_author="me", + issue_title="Bug fix", + cwd=temp_cwd, + quiet=True, + ) + + # Composite key for review_iteration=1, step=11 is 111. It MUST NOT + # appear in any post_step_comment_once call because Step 11 failed. + step_nums = [ + c.kwargs.get("step_num") + for c in mock_post_once.call_args_list + ] + assert 111 not in step_nums, ( + "Failed Step 11 should not have posted a fallback comment " + f"or burned composite key 111. Saw: {step_nums}" + ) + + def test_step12_failure_does_not_post_or_mark_iter_key( + self, mock_dependencies, temp_cwd + ): + """Same regression as above but for Step 12 (fix). + + When ``s12_success`` is False the fix task didn't actually complete, + so the trusted Step 12 post must be skipped and key + ``review_iteration*100 + 12`` must remain unposted. + """ + mocks = mock_dependencies + mock_run = mocks["run"] + + def side_effect_run(**kwargs): + label = kwargs.get("label", "") + if label == "step9": + return ( + True, + "step 9\nFILES_MODIFIED: file_a.py", + 0.5, "gpt-4", + ) + if label == "step10": + return ( + True, + "step 10\nARCHITECTURE_FILES_MODIFIED: arch.json", + 0.1, "gpt-4", + ) + if label.startswith("step11"): + # Step 11 succeeds but reports issues, driving the loop into + # Step 12 where the failure happens. + return ( + True, + "step 11 found issues\nIssues Found", + 0.1, "gpt-4", + ) + if label.startswith("step12"): + return (False, "Provider failure during fix", 0.1, "gpt-4") + if label == "step13": + return ( + True, + "step 13\nPR Created: https://github.com/o/r/pull/1", + 0.2, "gpt-4", + ) + return (True, f"step report {label}", 0.1, "gpt-4") + + mock_run.side_effect = side_effect_run + + with patch( + "pdd.agentic_change_orchestrator.post_step_comment_once", + return_value=True, + ) as mock_post_once: + run_agentic_change_orchestrator( + issue_url="http://url", + issue_content="Fix bug", + repo_owner="owner", + repo_name="repo", + issue_number=1, + issue_author="me", + issue_title="Bug fix", + cwd=temp_cwd, + quiet=True, + ) + + step_nums = [ + c.kwargs.get("step_num") + for c in mock_post_once.call_args_list + ] + assert 112 not in step_nums, ( + "Failed Step 12 should not have posted a fallback comment " + f"or burned composite key 112. Saw: {step_nums}" + ) diff --git a/tests/test_agentic_checkup_orchestrator.py b/tests/test_agentic_checkup_orchestrator.py index 544c1d72e..892c7badf 100644 --- a/tests/test_agentic_checkup_orchestrator.py +++ b/tests/test_agentic_checkup_orchestrator.py @@ -1868,3 +1868,73 @@ def test_resume_between_iterations(self, mock_dependencies, default_args, tmp_pa assert "step8" in called_labels # Should NOT have any iter1 labels (all were completed before resume) assert not any("iter1" in lbl for lbl in called_labels) + + +# --------------------------------------------------------------------------- +# Trusted step-comment wiring +# --------------------------------------------------------------------------- + + +class TestTrustedStepCommentPosting: + """The checkup orchestrator must extract from each successful + step's output and post via post_step_comment_once with a composite-key + encoding (fractional 6.1/6.2/6.3 and iterated fix-verify loop).""" + + def test_success_path_posts_step_comment_once(self, mock_dependencies, default_args): + mock_run, _, _, _ = mock_dependencies + mock_run.return_value = ( + True, + f"step report body\n{ALL_ISSUES_FIXED}", + 0.1, + "gpt-4", + ) + + with patch( + "pdd.agentic_checkup_orchestrator.post_step_comment_once", + return_value=True, + ) as mock_post_once: + success, _, _, _ = run_agentic_checkup_orchestrator(**default_args) + + assert success is True + assert mock_post_once.call_count >= 1 + for c in mock_post_once.call_args_list: + assert "step_num" in c.kwargs + assert isinstance(c.kwargs["step_num"], int) + assert "posted_steps" in c.kwargs + + def test_step_report_missing_posts_fallback_comment( + self, mock_dependencies, default_args + ): + mock_run, _, _, _ = mock_dependencies + mock_run.return_value = (True, f"Step output. {ALL_ISSUES_FIXED}", 0.1, "gpt-4") + + with patch( + "pdd.agentic_checkup_orchestrator.post_step_comment_once", + return_value=True, + ) as mock_post_once: + success, _, _, _ = run_agentic_checkup_orchestrator(**default_args) + + assert success is True + assert mock_post_once.call_count >= 1 + assert any( + "no `` block returned by agent" in c.kwargs["body"] + and "Raw output retained in workflow state" in c.kwargs["body"] + for c in mock_post_once.call_args_list + ) + + def test_post_exception_does_not_break_run(self, mock_dependencies, default_args): + mock_run, _, _, _ = mock_dependencies + mock_run.return_value = ( + True, + f"some report\n{ALL_ISSUES_FIXED}", + 0.1, + "gpt-4", + ) + + with patch( + "pdd.agentic_checkup_orchestrator.post_step_comment_once", + side_effect=RuntimeError("simulated gh failure"), + ): + success, _, _, _ = run_agentic_checkup_orchestrator(**default_args) + + assert success is True diff --git a/tests/test_agentic_common.py b/tests/test_agentic_common.py index bf8b23873..f223814a9 100644 --- a/tests/test_agentic_common.py +++ b/tests/test_agentic_common.py @@ -7192,3 +7192,303 @@ def test_reverts_out_of_scope_staged_rename(self, tmp_path): assert " -> " not in str(p), ( f"Fake combined path leaked into return value: {p!r}" ) + + +# --------------------------------------------------------------------------- +# Trusted step-comment helpers +# +# Orchestrators own per-step `## Step N/T:` comments via trusted PDD +# credentials. Step prompts emit `...` blocks; the +# orchestrator extracts them, sanitizes/truncates, and posts via +# `gh issue comment`. Three public helpers form the contract. +# --------------------------------------------------------------------------- + + +class TestExtractStepReport: + """Public helper: extract_step_report(text) -> Optional[str].""" + + def test_returns_inner_block_trimmed(self): + from pdd.agentic_common import extract_step_report + + body = ( + "preamble\n\n## Step 1/8: Discovery\n\n" + "Details here.\n\nFILES_MODIFIED: x.py" + ) + assert extract_step_report(body) == "## Step 1/8: Discovery\n\nDetails here." + + def test_returns_none_when_block_missing(self): + from pdd.agentic_common import extract_step_report + + assert extract_step_report("just some text no markers") is None + + def test_returns_none_on_empty_or_none(self): + from pdd.agentic_common import extract_step_report + + assert extract_step_report(None) is None + assert extract_step_report("") is None + + def test_returns_last_block_when_multiple(self): + from pdd.agentic_common import extract_step_report + + body = ( + "first\n" + "middle text\n" + "second body" + ) + assert extract_step_report(body) == "second body" + + def test_case_insensitive(self): + from pdd.agentic_common import extract_step_report + + body = "UPPERCASE" + assert extract_step_report(body) == "UPPERCASE" + + def test_multiline_dotall(self): + from pdd.agentic_common import extract_step_report + + body = "line1\nline2\nline3" + assert extract_step_report(body) == "line1\nline2\nline3" + + +class TestNormalizeStepCommentsState: + """Public helper: normalize_step_comments_state(raw) -> Set[int].""" + + def test_none_returns_empty_set(self): + from pdd.agentic_common import normalize_step_comments_state + + result = normalize_step_comments_state(None) + assert result == set() + assert isinstance(result, set) + + def test_list_of_ints_returns_set(self): + from pdd.agentic_common import normalize_step_comments_state + + assert normalize_step_comments_state([1, 2, 3]) == {1, 2, 3} + + def test_tuple_of_ints_returns_set(self): + from pdd.agentic_common import normalize_step_comments_state + + assert normalize_step_comments_state((4, 5, 6)) == {4, 5, 6} + + def test_set_returns_set_copy(self): + from pdd.agentic_common import normalize_step_comments_state + + src = {1, 2} + result = normalize_step_comments_state(src) + assert result == {1, 2} + result.add(99) + assert 99 not in src + + def test_list_of_int_strings_coerces(self): + from pdd.agentic_common import normalize_step_comments_state + + assert normalize_step_comments_state(["1", "2", "3"]) == {1, 2, 3} + + def test_legacy_dict_with_posted_flag_returns_set_of_int_keys(self): + """The legacy bug-orchestrator persisted step_comments as a dict: + ``{"1": {"posted": True}, "2": {"posted": False}}``. Coerce that + into the Set[int] shape, treating both ``posted`` and + ``failed_posted`` as signals that GitHub received a comment. + """ + from pdd.agentic_common import normalize_step_comments_state + + legacy = { + "1": {"posted": True}, + "2": {"posted": True, "fallback": True}, + "3": {"posted": False, "fallback_pending": True}, + "4": {"failed_posted": True, "failed_pending": False}, + } + result = normalize_step_comments_state(legacy) + assert result == {1, 2, 4} + + def test_legacy_dict_with_true_value(self): + from pdd.agentic_common import normalize_step_comments_state + + assert normalize_step_comments_state({"5": True, "6": False}) == {5} + + def test_malformed_entries_are_skipped(self): + from pdd.agentic_common import normalize_step_comments_state + + garbage = ["not-an-int", None, 5, "7", 9.5, True, {"posted": True}] + result = normalize_step_comments_state(garbage) + assert 5 in result + assert 7 in result + assert 9 not in result + assert "not-an-int" not in result + assert all(isinstance(v, int) for v in result) + + def test_empty_collections(self): + from pdd.agentic_common import normalize_step_comments_state + + assert normalize_step_comments_state([]) == set() + assert normalize_step_comments_state({}) == set() + assert normalize_step_comments_state(set()) == set() + + def test_garbage_input_returns_empty_set(self): + from pdd.agentic_common import normalize_step_comments_state + + assert normalize_step_comments_state(42) == set() + assert normalize_step_comments_state("string") == set() + + def test_frozenset_input(self): + from pdd.agentic_common import normalize_step_comments_state + + assert normalize_step_comments_state(frozenset({7, 8})) == {7, 8} + + +class TestPostStepCommentOnce: + """Public helper: post_step_comment_once(*, repo_owner, repo_name, + issue_number, step_num, body, posted_steps, cwd) -> bool. + + Idempotent: returns True immediately if step_num is already in + posted_steps. On success, mutates posted_steps in place. + """ + + def test_idempotent_skip_when_already_posted(self, tmp_path): + from pdd.agentic_common import post_step_comment_once + + posted = {1, 2, 3} + with patch("pdd.agentic_common.subprocess.run") as mock_run: + result = post_step_comment_once( + repo_owner="owner", + repo_name="repo", + issue_number=42, + step_num=2, + body="some report body", + posted_steps=posted, + cwd=tmp_path, + ) + assert result is True + assert mock_run.call_count == 0 + assert posted == {1, 2, 3} + + def test_posts_via_gh_and_mutates_set_on_success(self, tmp_path): + from pdd.agentic_common import post_step_comment_once + + posted = {1} + with patch("pdd.agentic_common._find_cli_binary", return_value="/usr/bin/gh"), \ + patch("pdd.agentic_common.subprocess.run") as mock_run: + mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="") + result = post_step_comment_once( + repo_owner="owner", + repo_name="repo", + issue_number=42, + step_num=5, + body="## Step 5/8: Test\n\nAll green.", + posted_steps=posted, + cwd=tmp_path, + ) + assert result is True + assert 5 in posted + assert mock_run.call_count == 1 + args, kwargs = mock_run.call_args + assert args[0][:4] == ["gh", "issue", "comment", "42"] + assert "--repo" in args[0] + assert "owner/repo" in args[0] + body_index = args[0].index("--body") + assert "## Step 5/8: Test" in args[0][body_index + 1] + + def test_returns_false_and_does_not_mutate_on_gh_failure(self, tmp_path): + from pdd.agentic_common import post_step_comment_once + + posted = {1} + with patch("pdd.agentic_common._find_cli_binary", return_value="/usr/bin/gh"), \ + patch("pdd.agentic_common.subprocess.run") as mock_run: + mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="rate limited") + result = post_step_comment_once( + repo_owner="owner", + repo_name="repo", + issue_number=42, + step_num=5, + body="body", + posted_steps=posted, + cwd=tmp_path, + ) + assert result is False + assert 5 not in posted + + def test_returns_false_when_gh_missing(self, tmp_path): + from pdd.agentic_common import post_step_comment_once + + posted = set() + with patch("pdd.agentic_common._find_cli_binary", return_value=None): + result = post_step_comment_once( + repo_owner="owner", + repo_name="repo", + issue_number=42, + step_num=5, + body="body", + posted_steps=posted, + cwd=tmp_path, + ) + assert result is False + assert posted == set() + + def test_redacts_known_secret_formats(self, tmp_path): + from pdd.agentic_common import post_step_comment_once + + secret_body = ( + "GH_TOKEN=ghp_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n" + "google: AIzaSyA-aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n" + "openai: sk-aaaaaaaaaaaaaaaaaaaaaaaa\n" + ) + with patch("pdd.agentic_common._find_cli_binary", return_value="/usr/bin/gh"), \ + patch("pdd.agentic_common.subprocess.run") as mock_run: + mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="") + post_step_comment_once( + repo_owner="owner", + repo_name="repo", + issue_number=1, + step_num=1, + body=secret_body, + posted_steps=set(), + cwd=tmp_path, + ) + sent = mock_run.call_args[0][0] + body_index = sent.index("--body") + posted_body = sent[body_index + 1] + assert "ghp_aaaaaaaaaaaaaaaaaaaaaaaa" not in posted_body + assert "AIzaSyA-aaaaaaaaaaaaaaaaaaaa" not in posted_body + assert "sk-aaaaaaaaaaaaaaaaaaaaaaaa" not in posted_body + assert "[REDACTED" in posted_body + + def test_truncates_oversized_body(self, tmp_path): + from pdd.agentic_common import post_step_comment_once + + oversized = "A" * 200_000 + with patch("pdd.agentic_common._find_cli_binary", return_value="/usr/bin/gh"), \ + patch("pdd.agentic_common.subprocess.run") as mock_run: + mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="") + post_step_comment_once( + repo_owner="owner", + repo_name="repo", + issue_number=1, + step_num=1, + body=oversized, + posted_steps=set(), + cwd=tmp_path, + ) + sent = mock_run.call_args[0][0] + body_index = sent.index("--body") + posted_body = sent[body_index + 1] + assert len(posted_body) < 100_000 + assert "truncated" in posted_body.lower() + + def test_exception_does_not_raise(self, tmp_path): + """Any subprocess exception is logged and surfaced as False.""" + from pdd.agentic_common import post_step_comment_once + + posted = set() + with patch("pdd.agentic_common._find_cli_binary", return_value="/usr/bin/gh"), \ + patch("pdd.agentic_common.subprocess.run", side_effect=OSError("boom")): + result = post_step_comment_once( + repo_owner="owner", + repo_name="repo", + issue_number=1, + step_num=1, + body="body", + posted_steps=posted, + cwd=tmp_path, + ) + assert result is False + assert posted == set() diff --git a/tests/test_agentic_e2e_fix_orchestrator.py b/tests/test_agentic_e2e_fix_orchestrator.py index 0f6a02d37..2fbd4667b 100644 --- a/tests/test_agentic_e2e_fix_orchestrator.py +++ b/tests/test_agentic_e2e_fix_orchestrator.py @@ -9697,6 +9697,596 @@ def test_prompt_documents_step2_reverification_on_resume(self): assert "deferred post-step-9 success" in section_body_lower +# --------------------------------------------------------------------------- +# Trusted step-comment wiring +# --------------------------------------------------------------------------- + + +class TestTrustedStepCommentPosting: + """The e2e_fix orchestrator must extract from each successful + step's output and post via post_step_comment_once with cycle-keyed + composite ints (`cycle * 10000 + step_num`).""" + + def test_success_path_posts_step_comment_once( + self, e2e_fix_mock_dependencies, e2e_fix_default_args + ): + mock_run, _, _ = e2e_fix_mock_dependencies + + def side_effect(instruction, cwd, *, verbose=False, quiet=False, label="", + timeout=None, max_retries=1, retry_delay=5.0, deadline=None, + use_playwright=False, reasoning_time=None): + if "step3" in label: + return ( + True, + "step 3 report\nNOT_A_BUG", + 0.1, "gpt-4", + ) + return (True, f"step report {label}", 0.1, "gpt-4") + + mock_run.side_effect = side_effect + + with patch( + "pdd.agentic_e2e_fix_orchestrator.post_step_comment_once", + return_value=True, + ) as mock_post_once: + run_agentic_e2e_fix_orchestrator(**e2e_fix_default_args) + + assert mock_post_once.call_count >= 1 + for c in mock_post_once.call_args_list: + assert "step_num" in c.kwargs + assert isinstance(c.kwargs["step_num"], int) + assert "posted_steps" in c.kwargs + + def test_step_report_missing_posts_fallback_comment( + self, e2e_fix_mock_dependencies, e2e_fix_default_args + ): + mock_run, _, _ = e2e_fix_mock_dependencies + + def side_effect(instruction, cwd, *, verbose=False, quiet=False, label="", + timeout=None, max_retries=1, retry_delay=5.0, deadline=None, + use_playwright=False, reasoning_time=None): + if "step3" in label: + return (True, "NOT_A_BUG", 0.1, "gpt-4") + return (True, f"Output for {label}", 0.1, "gpt-4") + + mock_run.side_effect = side_effect + + with patch( + "pdd.agentic_e2e_fix_orchestrator.post_step_comment_once", + return_value=True, + ) as mock_post_once: + run_agentic_e2e_fix_orchestrator(**e2e_fix_default_args) + + assert mock_post_once.call_count >= 1 + assert any( + "no `` block returned by agent" in c.kwargs["body"] + and "Raw output retained in workflow state" in c.kwargs["body"] + for c in mock_post_once.call_args_list + ) + + def test_post_exception_does_not_break_run( + self, e2e_fix_mock_dependencies, e2e_fix_default_args + ): + mock_run, _, _ = e2e_fix_mock_dependencies + + def side_effect(instruction, cwd, *, verbose=False, quiet=False, label="", + timeout=None, max_retries=1, retry_delay=5.0, deadline=None, + use_playwright=False, reasoning_time=None): + if "step3" in label: + return ( + True, + "step 3 report\nNOT_A_BUG", + 0.1, "gpt-4", + ) + return (True, f"step report {label}", 0.1, "gpt-4") + + mock_run.side_effect = side_effect + + with patch( + "pdd.agentic_e2e_fix_orchestrator.post_step_comment_once", + side_effect=RuntimeError("simulated gh failure"), + ): + # The run completes (success may be False due to NOT_A_BUG), but + # the helper exception must not propagate. + run_agentic_e2e_fix_orchestrator(**e2e_fix_default_args) + + def test_step10_ci_path_posts_trusted_comment( + self, e2e_fix_mock_dependencies, e2e_fix_default_args + ): + """Regression for Greg's PR review (round 2). + + Step 10 (CI validation) runs OUTSIDE the linear `for step_num in + range(1, 10)` loop, so it must post its trusted step-comment + separately. The composite key follows the same encoding as the + in-loop steps: ``current_cycle * 10000 + 10`` (so cycle 1 step 10 + keys to 10010). + """ + mock_run, _, _ = e2e_fix_mock_dependencies + + def side_effect(*args, **kwargs): + label = kwargs.get("label", "") + if "step9" in label: + return (True, "ALL_TESTS_PASS", 0.1, "gpt-4") + return (True, f"{label}", 0.1, "gpt-4") + + mock_run.side_effect = side_effect + e2e_fix_default_args["issue_url"] = "https://github.com/owner/repo/issues/1" + e2e_fix_default_args["skip_cleanup"] = True + + with patch( + "pdd.agentic_e2e_fix_orchestrator._verify_tests_independently", + return_value=(True, "1 passed"), + ), patch( + "pdd.agentic_e2e_fix_orchestrator._extract_test_files", + return_value=["tests/test_foo.py"], + ), patch( + "pdd.agentic_e2e_fix_orchestrator.run_ci_validation_loop", + return_value=(True, "Required CI checks passed", 0.0), + ), patch( + "pdd.agentic_e2e_fix_orchestrator._find_open_pr_number", + return_value=77, + create=True, + ), patch( + "pdd.agentic_e2e_fix_orchestrator._fetch_pr_head_sha", + return_value="aaaaaaaa", + create=True, + ), patch( + "pdd.agentic_checkup.run_agentic_checkup", + return_value=(True, "checkup ok", 0.0, "fake-model"), + ), patch( + "pdd.agentic_e2e_fix_orchestrator.post_step_comment_once", + return_value=True, + ) as mock_post_once: + run_agentic_e2e_fix_orchestrator(**e2e_fix_default_args) + + step_nums = [c.kwargs.get("step_num") for c in mock_post_once.call_args_list] + assert 10010 in step_nums, ( + "Step 10 CI-validation trusted post (cycle 1 * 10000 + 10) was " + f"not made. Saw keys: {step_nums}" + ) + # Confirm the body uses the orchestrator-format header so we know it + # came through the trusted path and not from a stray legacy site. + assert any( + c.kwargs.get("step_num") == 10010 + and c.kwargs.get("body", "").startswith("## Step 10/11:") + for c in mock_post_once.call_args_list + ), "Step 10 trusted post is missing the '## Step 10/11:' header" + + def test_step11_cleanup_path_posts_trusted_comment( + self, e2e_fix_mock_dependencies, e2e_fix_default_args + ): + """Regression for Greg's PR review (round 2). + + Step 11 (code cleanup) is a post-loop helper. Its trusted + step-comment must be posted from the call site once the helper + returns, keyed by ``current_cycle * 10000 + 11`` (so cycle 1 + keys to 10011), and must use the ``## Step 11/11:`` header so + downstream tooling can recognize it as orchestrator-owned. + """ + mock_run, _, _ = e2e_fix_mock_dependencies + + def side_effect(*args, **kwargs): + label = kwargs.get("label", "") + if "step9" in label: + return (True, "ALL_TESTS_PASS", 0.1, "gpt-4") + return (True, f"{label}", 0.1, "gpt-4") + + mock_run.side_effect = side_effect + e2e_fix_default_args["issue_url"] = "https://github.com/owner/repo/issues/1" + e2e_fix_default_args["skip_ci"] = True + + with patch( + "pdd.agentic_e2e_fix_orchestrator._verify_tests_independently", + return_value=(True, "1 passed"), + ), patch( + "pdd.agentic_e2e_fix_orchestrator._extract_test_files", + return_value=["tests/test_foo.py"], + ), patch( + "pdd.agentic_e2e_fix_orchestrator._run_step11_code_cleanup", + return_value=(0.0, ["module.py"]), + ), patch( + "pdd.agentic_e2e_fix_orchestrator.post_step_comment_once", + return_value=True, + ) as mock_post_once: + run_agentic_e2e_fix_orchestrator(**e2e_fix_default_args) + + step_nums = [c.kwargs.get("step_num") for c in mock_post_once.call_args_list] + assert 10011 in step_nums, ( + "Step 11 cleanup trusted post (cycle 1 * 10000 + 11) was not " + f"made. Saw keys: {step_nums}" + ) + assert any( + c.kwargs.get("step_num") == 10011 + and c.kwargs.get("body", "").startswith("## Step 11/11:") + for c in mock_post_once.call_args_list + ), "Step 11 trusted post is missing the '## Step 11/11:' header" + + def test_step11_post_persists_step_comments_before_ci_failure_return( + self, e2e_fix_mock_dependencies, e2e_fix_default_args + ): + """Regression for Greg's PR review (round 3 — idempotency). + + After Step 11 (cleanup) posts its trusted comment, the orchestrator + MUST save workflow state so the composite key + ``current_cycle * 10000 + 11`` is durable. Otherwise, when + ``run_ci_validation_loop`` returns ``ci_success=False`` and the + orchestrator returns at the CI-failure path WITHOUT touching state, + a later resume rehydrates ``step_comments_set`` from disk, misses + the Step 11 key, and re-posts the same comment. + + The assertion: at least one ``save_workflow_state`` call between + Step 11 post and CI return must carry the Step 11 key + (``10011``) in its ``state`` argument's ``step_comments`` list. + """ + mock_run, _, _ = e2e_fix_mock_dependencies + + def side_effect(*args, **kwargs): + label = kwargs.get("label", "") + if "step9" in label: + return (True, "ALL_TESTS_PASS", 0.1, "gpt-4") + return (True, f"{label}", 0.1, "gpt-4") + + mock_run.side_effect = side_effect + e2e_fix_default_args["issue_url"] = "https://github.com/owner/repo/issues/1" + + # Faithfully simulate post_step_comment_once mutating ``posted_steps`` + # the way the real helper does (it adds ``step_num`` on success). A + # bare ``return_value=True`` would leave the in-memory set empty and + # mask the persistence path the orchestrator must take. + def _add_key(**kwargs): + kwargs["posted_steps"].add(kwargs["step_num"]) + return True + + with patch( + "pdd.agentic_e2e_fix_orchestrator._verify_tests_independently", + return_value=(True, "1 passed"), + ), patch( + "pdd.agentic_e2e_fix_orchestrator._extract_test_files", + return_value=["tests/test_foo.py"], + ), patch( + "pdd.agentic_e2e_fix_orchestrator._run_step11_code_cleanup", + return_value=(0.0, ["module.py"]), + ), patch( + "pdd.agentic_e2e_fix_orchestrator.run_ci_validation_loop", + return_value=(False, "Required CI checks failed", 0.0), + ), patch( + "pdd.agentic_e2e_fix_orchestrator.post_step_comment_once", + side_effect=_add_key, + ), patch( + "pdd.agentic_e2e_fix_orchestrator.save_workflow_state", + return_value=None, + ) as mock_save: + success, _, _, _, _ = run_agentic_e2e_fix_orchestrator( + **e2e_fix_default_args + ) + + assert success is False, "CI failure must propagate as success=False" + # Look for a save_workflow_state call whose state dict's + # `step_comments` includes the Step 11 key (10011). save signature: + # save_workflow_state(cwd, issue_number, workflow_type, state, ...) + # so the state dict is positional arg index 3. + step11_persisted = any( + 10011 in ( + call.args[3] if len(call.args) > 3 else call.kwargs.get("state", {}) + ).get("step_comments", []) + for call in mock_save.call_args_list + ) + assert step11_persisted, ( + "Step 11 cleanup trusted post must persist step_comments via " + "save_workflow_state before any CI-failure return so the " + "composite key 10011 is durable across resume/retry. " + f"save_workflow_state was called {mock_save.call_count} times but " + "none carried 10011 in their state['step_comments']." + ) + + def test_step10_post_persists_step_comments_before_checkup_failure_return( + self, e2e_fix_mock_dependencies, e2e_fix_default_args + ): + """Regression for Greg's PR review (round 3 — idempotency). + + After Step 10 (CI validation) posts its trusted comment on + ``ci_success=True``, the orchestrator MUST save workflow state so + the composite key ``current_cycle * 10000 + 10`` is durable. + Otherwise, when ``_run_final_checkup_on_pr`` returns + ``checkup_success=False`` and the orchestrator returns at the + final-checkup-failure path, a later resume re-posts the Step 10 + comment (and likely Step 11 too). + + Assertion: at least one ``save_workflow_state`` call between + Step 10 post and the checkup-failure return must carry the Step 10 + key (``10010``) in its ``state``'s ``step_comments``. + """ + mock_run, _, _ = e2e_fix_mock_dependencies + + def side_effect(*args, **kwargs): + label = kwargs.get("label", "") + if "step9" in label: + return (True, "ALL_TESTS_PASS", 0.1, "gpt-4") + return (True, f"{label}", 0.1, "gpt-4") + + mock_run.side_effect = side_effect + e2e_fix_default_args["issue_url"] = "https://github.com/owner/repo/issues/1" + e2e_fix_default_args["skip_cleanup"] = True + + def _add_key(**kwargs): + kwargs["posted_steps"].add(kwargs["step_num"]) + return True + + with patch( + "pdd.agentic_e2e_fix_orchestrator._verify_tests_independently", + return_value=(True, "1 passed"), + ), patch( + "pdd.agentic_e2e_fix_orchestrator._extract_test_files", + return_value=["tests/test_foo.py"], + ), patch( + "pdd.agentic_e2e_fix_orchestrator.run_ci_validation_loop", + return_value=(True, "Required CI checks passed", 0.0), + ), patch( + "pdd.agentic_e2e_fix_orchestrator._find_open_pr_number", + return_value=77, + create=True, + ), patch( + "pdd.agentic_e2e_fix_orchestrator._fetch_pr_head_sha", + return_value="aaaaaaaa", + create=True, + ), patch( + "pdd.agentic_checkup.run_agentic_checkup", + return_value=(False, "checkup failed", 0.0, "fake-model"), + ), patch( + "pdd.agentic_e2e_fix_orchestrator.post_step_comment_once", + side_effect=_add_key, + ), patch( + "pdd.agentic_e2e_fix_orchestrator.save_workflow_state", + return_value=None, + ) as mock_save: + success, _, _, _, _ = run_agentic_e2e_fix_orchestrator( + **e2e_fix_default_args + ) + + assert success is False, "Final-checkup failure must propagate" + step10_persisted = any( + 10010 in ( + call.args[3] if len(call.args) > 3 else call.kwargs.get("state", {}) + ).get("step_comments", []) + for call in mock_save.call_args_list + ) + assert step10_persisted, ( + "Step 10 CI trusted post must persist step_comments via " + "save_workflow_state before any final-checkup-failure return so " + "the composite key 10010 is durable across resume/retry. " + f"save_workflow_state was called {mock_save.call_count} times but " + "none carried 10010 in their state['step_comments']." + ) + + def test_step11_persists_on_success_fall_through_resume_path( + self, e2e_fix_mock_dependencies, e2e_fix_default_args + ): + """Regression for Greg's PR review (round 4 — resume hole). + + On the SUCCESS_FALL_THROUGH resume path, the cached Step 9 output + already says ``ALL_TESTS_PASS``, the orchestrator marks + ``success = True`` BEFORE the outer ``while`` loop, and the inner + ``for step_num in range(1, 10)`` loop never runs. That loop is + the only place that initializes the inner-loop ``state_data`` + dict, so any post-loop reference to it would raise + ``UnboundLocalError`` (and the broad try/except around the post + would swallow the failure as a "post_step_comment_once failed" + warning, leaving the composite key unpersisted). + + Assert that even on this resume path, the post-loop Step 11 + trusted-comment save still calls ``save_workflow_state`` with + ``state["step_comments"]`` containing key ``10011`` before any + CI-failure return. + """ + mock_run, _, _ = e2e_fix_mock_dependencies + + def side_effect(*args, **kwargs): + label = kwargs.get("label", "") + if "step9" in label: + return (True, "ALL_TESTS_PASS", 0.1, "gpt-4") + return (True, f"{label}", 0.1, "gpt-4") + + mock_run.side_effect = side_effect + + # Resumed state representing "Step 9 already declared + # ALL_TESTS_PASS in the prior run". The resume code path checks + # ``last_completed_step == 9`` AND the cached Step 9 output + # contains a success token, then marks the run as a + # SUCCESS_FALL_THROUGH (the post-loop Step 11 / Step 10 path). + resumed_state = { + "workflow": "e2e_fix", + "issue_url": "https://github.com/owner/repo/issues/1", + "issue_number": 1, + "current_cycle": 1, + "last_completed_step": 9, + "step_outputs": { + "1": "Step 1 output", + "2": "Step 2 output", + "3": "Root cause: CODE_BUG in module.py", + "4": "Step 4 output", + "5": "Step 5 output", + "6": "Step 6 output", + "7": "Step 7 output", + "8": "Step 8 output", + "9": "ALL_TESTS_PASS — verification succeeded", + }, + "total_cost": 0.5, + "model_used": "gpt-4", + "changed_files": ["src/module.py"], + "dev_unit_states": {}, + "skipped_steps": {}, + # step_comments empty so the post-loop save MUST add 10011 + # for this assertion to pass. + "step_comments": [], + "initial_file_hashes": {"src/module.py": "originalhash"}, + "initial_sha": "deadbeef", + "last_saved_at": "2026-01-01T00:00:00", + } + + e2e_fix_default_args["issue_url"] = "https://github.com/owner/repo/issues/1" + # Set resume=True (default) and force the resume code path by + # making load_workflow_state return the cached "Step 9 success" + # state. + + def _add_key(**kwargs): + kwargs["posted_steps"].add(kwargs["step_num"]) + return True + + with patch( + "pdd.agentic_e2e_fix_orchestrator.load_workflow_state", + return_value=(resumed_state, None), + ), patch( + "pdd.agentic_e2e_fix_orchestrator._verify_tests_independently", + return_value=(True, "1 passed"), + ), patch( + "pdd.agentic_e2e_fix_orchestrator._extract_test_files", + return_value=["tests/test_foo.py"], + ), patch( + "pdd.agentic_e2e_fix_orchestrator._run_step11_code_cleanup", + return_value=(0.0, ["module.py"]), + ), patch( + "pdd.agentic_e2e_fix_orchestrator._commit_and_push", + return_value=(True, "ok"), + ), patch( + "pdd.agentic_e2e_fix_orchestrator.run_ci_validation_loop", + return_value=(False, "Required CI checks failed", 0.0), + ), patch( + "pdd.agentic_e2e_fix_orchestrator.post_step_comment_once", + side_effect=_add_key, + ), patch( + "pdd.agentic_e2e_fix_orchestrator.save_workflow_state", + return_value=None, + ) as mock_save: + success, _, _, _, _ = run_agentic_e2e_fix_orchestrator( + **e2e_fix_default_args + ) + + # CI failure must propagate as success=False. + assert success is False + # And the Step 11 key MUST appear in at least one save_workflow_state + # call's state dict — that is the round-4 invariant Greg flagged. + step11_persisted = any( + 10011 in ( + call.args[3] if len(call.args) > 3 else call.kwargs.get("state", {}) + ).get("step_comments", []) + for call in mock_save.call_args_list + ) + assert step11_persisted, ( + "Step 11 cleanup trusted post must persist step_comments on the " + "SUCCESS_FALL_THROUGH resume path even though the inner loop " + "never ran. save_workflow_state was called " + f"{mock_save.call_count} times but none carried 10011 in their " + "state['step_comments']. If this fails on a pre-fix build, the " + "post is being silently dropped by an UnboundLocalError swallowed " + "in the post-step try/except." + ) + + def test_step10_persists_on_success_fall_through_resume_path( + self, e2e_fix_mock_dependencies, e2e_fix_default_args + ): + """Same round-4 regression as above, but for the Step 10 CI post + on a SUCCESS_FALL_THROUGH resume path. The inner loop never runs, + but Step 10's trusted post must still persist its composite key + (``10010``) to saved state before any final-checkup-failure + return. + """ + mock_run, _, _ = e2e_fix_mock_dependencies + + def side_effect(*args, **kwargs): + label = kwargs.get("label", "") + if "step9" in label: + return (True, "ALL_TESTS_PASS", 0.1, "gpt-4") + return (True, f"{label}", 0.1, "gpt-4") + + mock_run.side_effect = side_effect + + resumed_state = { + "workflow": "e2e_fix", + "issue_url": "https://github.com/owner/repo/issues/1", + "issue_number": 1, + "current_cycle": 1, + "last_completed_step": 9, + "step_outputs": { + "1": "Step 1 output", + "2": "Step 2 output", + "3": "Root cause: CODE_BUG in module.py", + "4": "Step 4 output", + "5": "Step 5 output", + "6": "Step 6 output", + "7": "Step 7 output", + "8": "Step 8 output", + "9": "ALL_TESTS_PASS — verification succeeded", + }, + "total_cost": 0.5, + "model_used": "gpt-4", + "changed_files": ["src/module.py"], + "dev_unit_states": {}, + "skipped_steps": {}, + "step_comments": [], + "initial_file_hashes": {"src/module.py": "originalhash"}, + "initial_sha": "deadbeef", + "last_saved_at": "2026-01-01T00:00:00", + } + + e2e_fix_default_args["issue_url"] = "https://github.com/owner/repo/issues/1" + e2e_fix_default_args["skip_cleanup"] = True + + def _add_key(**kwargs): + kwargs["posted_steps"].add(kwargs["step_num"]) + return True + + with patch( + "pdd.agentic_e2e_fix_orchestrator.load_workflow_state", + return_value=(resumed_state, None), + ), patch( + "pdd.agentic_e2e_fix_orchestrator._verify_tests_independently", + return_value=(True, "1 passed"), + ), patch( + "pdd.agentic_e2e_fix_orchestrator._extract_test_files", + return_value=["tests/test_foo.py"], + ), patch( + "pdd.agentic_e2e_fix_orchestrator._commit_and_push", + return_value=(True, "ok"), + ), patch( + "pdd.agentic_e2e_fix_orchestrator.run_ci_validation_loop", + return_value=(True, "Required CI checks passed", 0.0), + ), patch( + "pdd.agentic_e2e_fix_orchestrator._find_open_pr_number", + return_value=77, + create=True, + ), patch( + "pdd.agentic_e2e_fix_orchestrator._fetch_pr_head_sha", + return_value="aaaaaaaa", + create=True, + ), patch( + "pdd.agentic_checkup.run_agentic_checkup", + return_value=(False, "checkup failed", 0.0, "fake-model"), + ), patch( + "pdd.agentic_e2e_fix_orchestrator.post_step_comment_once", + side_effect=_add_key, + ), patch( + "pdd.agentic_e2e_fix_orchestrator.save_workflow_state", + return_value=None, + ) as mock_save: + success, _, _, _, _ = run_agentic_e2e_fix_orchestrator( + **e2e_fix_default_args + ) + + assert success is False + step10_persisted = any( + 10010 in ( + call.args[3] if len(call.args) > 3 else call.kwargs.get("state", {}) + ).get("step_comments", []) + for call in mock_save.call_args_list + ) + assert step10_persisted, ( + "Step 10 CI trusted post must persist step_comments on the " + "SUCCESS_FALL_THROUGH resume path. save_workflow_state was " + f"called {mock_save.call_count} times but none carried 10010 " + "in their state['step_comments']." + ) + + class TestFinalCheckupForwardsCwd: def test_run_final_checkup_passes_cwd_to_run_agentic_checkup(self, tmp_path): from pdd.agentic_e2e_fix_orchestrator import _run_final_checkup_on_pr diff --git a/tests/test_agentic_test_orchestrator.py b/tests/test_agentic_test_orchestrator.py index 1afc7972c..d3be30f4b 100644 --- a/tests/test_agentic_test_orchestrator.py +++ b/tests/test_agentic_test_orchestrator.py @@ -961,3 +961,125 @@ def test_test_orchestrator_check_hard_stop_patterns_functional(): assert result is None, ( "Test orchestrator step 3 casual mention must not trigger without STOP_CONDITION tag" ) + + +# --------------------------------------------------------------------------- +# Trusted step-comment wiring +# --------------------------------------------------------------------------- + + +class TestTrustedStepCommentPosting: + """The test orchestrator must extract from each successful + step's output and post via post_step_comment_once with a composite-key + encoding (fractional step 5.5 and iterated manual-testing loop).""" + + def test_success_path_posts_step_comment_once(self, mock_deps, default_args): + mocks = mock_deps + + def side_effect(instruction, cwd, *, verbose=False, quiet=False, label="", + timeout=None, max_retries=1, retry_delay=5.0, deadline=None, + use_playwright=False): + if label == "step4": + return ( + True, + "step 4\nTEST_TYPE: api\nTEST_FRAMEWORK: pytest", + 0.1, "anthropic", + ) + if label == "step12": + return ( + True, + "step 12\nFILES_CREATED: tests/test_api.py", + 0.1, "anthropic", + ) + if label == "step17": + return ( + True, + "step 17\nPR Created: https://github.com/o/r/pull/10", + 0.1, "anthropic", + ) + return (True, f"step report {label}", 0.1, "anthropic") + + mocks["run"].side_effect = side_effect + + with patch( + "pdd.agentic_test_orchestrator.post_step_comment_once", + return_value=True, + ) as mock_post_once: + success, _, _, _, _ = run_agentic_test_orchestrator(**default_args) + + assert success is True + assert mock_post_once.call_count >= 10 + for c in mock_post_once.call_args_list: + assert "step_num" in c.kwargs + assert isinstance(c.kwargs["step_num"], int) + assert "posted_steps" in c.kwargs + + def test_step_report_missing_posts_fallback_comment( + self, mock_deps, default_args + ): + """Steps that omit still get a visible fallback comment.""" + mocks = mock_deps + + def side_effect(instruction, cwd, *, verbose=False, quiet=False, label="", + timeout=None, max_retries=1, retry_delay=5.0, deadline=None, + use_playwright=False): + if label == "step4": + return (True, "TEST_TYPE: api", 0.1, "anthropic") + if label == "step12": + return (True, "FILES_CREATED: tests/test_api.py", 0.1, "anthropic") + if label == "step17": + return (True, "PR Created: https://github.com/o/r/pull/10", 0.1, "anthropic") + return (True, f"Output for {label} (no report block)", 0.1, "anthropic") + + mocks["run"].side_effect = side_effect + + with patch( + "pdd.agentic_test_orchestrator.post_step_comment_once", + return_value=True, + ) as mock_post_once: + success, _, _, _, _ = run_agentic_test_orchestrator(**default_args) + + assert success is True + assert mock_post_once.call_count >= 10 + assert any( + "no `` block returned by agent" in c.kwargs["body"] + and "Raw output retained in workflow state" in c.kwargs["body"] + for c in mock_post_once.call_args_list + ) + + def test_post_exception_does_not_break_run(self, mock_deps, default_args): + """Exceptions raised by the helper must be log-and-continue.""" + mocks = mock_deps + + def side_effect(instruction, cwd, *, verbose=False, quiet=False, label="", + timeout=None, max_retries=1, retry_delay=5.0, deadline=None, + use_playwright=False): + if label == "step4": + return ( + True, + "step 4\nTEST_TYPE: api", + 0.1, "anthropic", + ) + if label == "step12": + return ( + True, + "step 12\nFILES_CREATED: tests/test_api.py", + 0.1, "anthropic", + ) + if label == "step17": + return ( + True, + "step 17\nPR Created: https://github.com/o/r/pull/10", + 0.1, "anthropic", + ) + return (True, f"step report {label}", 0.1, "anthropic") + + mocks["run"].side_effect = side_effect + + with patch( + "pdd.agentic_test_orchestrator.post_step_comment_once", + side_effect=RuntimeError("simulated gh failure"), + ): + success, _, _, _, _ = run_agentic_test_orchestrator(**default_args) + + assert success is True diff --git a/tests/test_post_step_comment_once_real.py b/tests/test_post_step_comment_once_real.py new file mode 100644 index 000000000..021e82f89 --- /dev/null +++ b/tests/test_post_step_comment_once_real.py @@ -0,0 +1,124 @@ +"""Real-LLM / real-network test for ``post_step_comment_once``. + +This test actually shells out to ``gh issue comment`` to post a real +comment on a real GitHub issue, then deletes it. Skipped by default; +to run: + + PDD_RUN_REAL_LLM_TESTS=1 PDD_REAL_GH_ISSUE_URL=https://github.com///issues/ \\ + pytest tests/test_post_step_comment_once_real.py -m real + +Requires ``gh`` CLI on PATH and ``GH_TOKEN`` / authenticated session. +""" +from __future__ import annotations + +import os +import re +import subprocess +from pathlib import Path + +import pytest + + +_REAL_FLAG = os.environ.get("PDD_RUN_REAL_LLM_TESTS") == "1" +_REAL_URL = os.environ.get("PDD_REAL_GH_ISSUE_URL", "") + +_URL_RE = re.compile( + r"https://github\.com/(?P[^/]+)/(?P[^/]+)/issues/(?P\d+)" +) + + +def _parse_url(url: str): + m = _URL_RE.match(url) + if not m: + return None + return m.group("owner"), m.group("repo"), int(m.group("num")) + + +def _gh_on_path() -> bool: + try: + return subprocess.run( + ["which", "gh"], capture_output=True, text=True + ).returncode == 0 + except (FileNotFoundError, OSError): + return False + + +@pytest.mark.real +@pytest.mark.skipif( + not _REAL_FLAG, + reason="real-network test gated by PDD_RUN_REAL_LLM_TESTS=1", +) +@pytest.mark.skipif( + not _REAL_URL, + reason="requires PDD_REAL_GH_ISSUE_URL to point at a writable issue", +) +@pytest.mark.skipif(not _gh_on_path(), reason="gh CLI not on PATH") +def test_real_post_step_comment_once_round_trip(tmp_path: Path) -> None: + """Post a real comment, find it, delete it. End-to-end.""" + parsed = _parse_url(_REAL_URL) + if parsed is None: + pytest.skip(f"PDD_REAL_GH_ISSUE_URL not a valid issue URL: {_REAL_URL!r}") + owner, repo, issue_number = parsed + + from pdd.agentic_common import post_step_comment_once + + posted_steps: set = set() + body = "## Trusted-comment helper round-trip test\n\nIf you see this, the helper works. (Auto-deletion follows.)" + + result = post_step_comment_once( + repo_owner=owner, + repo_name=repo, + issue_number=issue_number, + step_num=999_999, + body=body, + posted_steps=posted_steps, + cwd=tmp_path, + ) + + assert result is True, "post_step_comment_once returned False on real post" + assert 999_999 in posted_steps + + # Idempotency: a second call with the same step_num is a no-op. + result_again = post_step_comment_once( + repo_owner=owner, + repo_name=repo, + issue_number=issue_number, + step_num=999_999, + body="DIFFERENT BODY — should not be posted", + posted_steps=posted_steps, + cwd=tmp_path, + ) + assert result_again is True + + # Discover the comment we just posted and delete it. We look at the + # most recent issue comments and find ours by body prefix. + list_proc = subprocess.run( + [ + "gh", "api", + f"repos/{owner}/{repo}/issues/{issue_number}/comments", + "--paginate", + "-q", + ".[] | select(.body | startswith(\"## Trusted-comment helper\")) | .id", + ], + capture_output=True, + text=True, + ) + if list_proc.returncode != 0: + pytest.fail( + f"Failed to list comments for cleanup: rc={list_proc.returncode} " + f"stderr={list_proc.stderr!r}" + ) + comment_ids = [line.strip() for line in list_proc.stdout.splitlines() if line.strip()] + assert comment_ids, "Expected at least one matching comment to delete" + + for cid in comment_ids: + subprocess.run( + [ + "gh", "api", + f"repos/{owner}/{repo}/issues/comments/{cid}", + "-X", "DELETE", + ], + capture_output=True, + text=True, + check=False, + )