Skip to content

Commit b53cde9

Browse files
author
bgagent
committed
feat(agent): add in-pipeline pre-PR self-review phase (#262)
Adds an optional self-review phase between agent execution and post-hooks where the LLM critiques its own cumulative diff before the PR is created. This improves first-pass PR quality by catching bugs, style issues, and test gaps before human review. - New self_review.py orchestration module with run_self_review() - New prompts/self_review.py with focused review prompt template - TaskConfig extended with self_review_enabled and self_review_max_turns - Fields threaded through build_config, get_config, server, pipeline - Fail-open: self-review errors never block PR creation - Uses remaining turns/budget from original allocation (capped) - Feature is opt-in (disabled by default)
1 parent e3abe92 commit b53cde9

8 files changed

Lines changed: 595 additions & 0 deletions

File tree

agent/src/config.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -337,6 +337,8 @@ def build_config(
337337
initial_approval_gate_count: int = 0,
338338
approval_gate_cap: int | None = None,
339339
attachments: list[dict] | None = None,
340+
self_review_enabled: bool = False,
341+
self_review_max_turns: int = 5,
340342
) -> TaskConfig:
341343
"""Build and validate configuration from explicit parameters.
342344
@@ -407,6 +409,8 @@ def build_config(
407409
initial_approval_gate_count=initial_approval_gate_count,
408410
approval_gate_cap=approval_gate_cap,
409411
attachments=validated_attachments,
412+
self_review_enabled=self_review_enabled,
413+
self_review_max_turns=self_review_max_turns,
410414
)
411415

412416

@@ -431,6 +435,9 @@ def get_config() -> TaskConfig:
431435
# an unreachable ``traces//`` key.
432436
trace=os.environ.get("TRACE", "").lower() in ("1", "true", "yes"),
433437
user_id=os.environ.get("USER_ID", ""),
438+
self_review_enabled=os.environ.get("SELF_REVIEW_ENABLED", "").lower()
439+
in ("1", "true", "yes"),
440+
self_review_max_turns=int(os.environ.get("SELF_REVIEW_MAX_TURNS", "5")),
434441
)
435442
except ValueError as e:
436443
print(f"ERROR: {e}", file=sys.stderr)

agent/src/models.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,9 @@ class TaskConfig(BaseModel):
186186
# Attachments from the orchestrator payload (Phase 3). Validated as
187187
# AttachmentConfig models. Empty list for tasks without attachments.
188188
attachments: list[AttachmentConfig] = Field(default_factory=list)
189+
# Self-review: optional LLM diff critique before PR creation.
190+
self_review_enabled: bool = False
191+
self_review_max_turns: int = 5 # Cap on turns allocated to self-review
189192

190193
@model_validator(mode="after")
191194
def _validate_trace_requires_user_id(self) -> Self:

agent/src/pipeline.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
from progress_writer import _ProgressWriter
3030
from prompt_builder import build_system_prompt, discover_project_config
3131
from runner import run_agent
32+
from self_review import run_self_review
3233
from shell import log, log_error_cw
3334
from system_prompt import SYSTEM_PROMPT
3435
from telemetry import (
@@ -279,6 +280,8 @@ def run_task(
279280
trace: bool = False,
280281
user_id: str = "",
281282
attachments: list[dict] | None = None,
283+
self_review_enabled: bool = False,
284+
self_review_max_turns: int = 5,
282285
) -> dict:
283286
"""Run the full agent pipeline and return a serialized result dict.
284287
@@ -318,6 +321,8 @@ def run_task(
318321
initial_approval_gate_count=initial_approval_gate_count,
319322
approval_gate_cap=approval_gate_cap,
320323
attachments=attachments,
324+
self_review_enabled=self_review_enabled,
325+
self_review_max_turns=self_review_max_turns,
321326
)
322327

323328
# Inject Cedar policies into config for the PolicyEngine in runner.py
@@ -623,6 +628,22 @@ def _on_trace_truncated(max_bytes: int, first_dropped: int) -> None:
623628
"turns_attempted": agent_result.num_turns or agent_result.turns,
624629
}
625630

631+
# Self-review phase: LLM critiques its own diff before PR creation.
632+
# Runs between cancel-check and post-hooks. Fail-open: errors here
633+
# never block PR creation.
634+
with task_span("task.self_review"):
635+
review_result = run_self_review(
636+
config, setup, agent_result, trajectory, progress
637+
)
638+
if review_result is not None:
639+
# Accumulate turns and cost from the review phase
640+
agent_result.turns += review_result.turns
641+
agent_result.num_turns += review_result.num_turns or review_result.turns
642+
if review_result.cost_usd is not None:
643+
agent_result.cost_usd = (
644+
(agent_result.cost_usd or 0.0) + review_result.cost_usd
645+
)
646+
626647
# Post-hooks (agent_result is guaranteed set by the try/except above)
627648
with task_span("task.post_hooks") as post_span:
628649
# Safety net: commit any uncommitted tracked changes (skip for read-only tasks)

agent/src/prompts/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from .new_task import NEW_TASK_WORKFLOW
55
from .pr_iteration import PR_ITERATION_WORKFLOW
66
from .pr_review import PR_REVIEW_WORKFLOW
7+
from .self_review import SELF_REVIEW_PROMPT as SELF_REVIEW_PROMPT
78

89
_PROMPTS = {
910
"new_task": BASE_PROMPT.replace("{workflow}", NEW_TASK_WORKFLOW),

agent/src/prompts/self_review.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
"""Self-review prompt template for pre-PR diff critique."""
2+
3+
SELF_REVIEW_PROMPT = """\
4+
You are reviewing your own work before it becomes a pull request. Below is the \
5+
cumulative diff of all changes on this branch compared to the base branch.
6+
7+
<diff>
8+
{diff}
9+
</diff>
10+
11+
## Task context
12+
13+
{task_description}
14+
15+
## Review checklist
16+
17+
Examine the diff carefully for:
18+
19+
1. **Correctness** — Logic errors, off-by-one mistakes, missing edge cases, \
20+
incorrect assumptions about data shapes or API contracts.
21+
2. **Bugs** — Null/undefined dereferences, unhandled error paths, resource leaks, \
22+
race conditions.
23+
3. **Security** — Injection vulnerabilities (SQL, command, XSS), hardcoded secrets, \
24+
insecure defaults, OWASP Top 10 issues.
25+
4. **Style & consistency** — Naming conventions, code style violations relative to \
26+
the surrounding codebase, unnecessary complexity.
27+
5. **Test gaps** — Important behaviour that is untested, assertions that don't \
28+
verify the right thing, missing edge-case coverage.
29+
30+
## Instructions
31+
32+
- If you find issues, fix them directly: edit the files, run the build/tests to \
33+
verify your fixes, and commit the changes.
34+
- If no issues are found, stop immediately — do not make changes for the sake of \
35+
making changes.
36+
- Do NOT refactor code that was not part of the original diff unless it has a \
37+
concrete bug or security issue.
38+
- Keep fixes minimal and focused. Each fix should be a separate commit with a \
39+
clear message.
40+
"""

agent/src/self_review.py

Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
"""Self-review orchestration: LLM critiques its own diff before PR creation."""
2+
3+
from __future__ import annotations
4+
5+
import asyncio
6+
import subprocess
7+
from typing import TYPE_CHECKING
8+
9+
from prompts.self_review import SELF_REVIEW_PROMPT
10+
from shell import log
11+
12+
if TYPE_CHECKING:
13+
from models import AgentResult, RepoSetup, TaskConfig
14+
from progress_writer import _ProgressWriter
15+
from telemetry import _TrajectoryWriter
16+
17+
# Diff truncation limit (characters). Large diffs are cut at hunk boundaries.
18+
_MAX_DIFF_CHARS = 60_000
19+
20+
# Minimal system prompt for the self-review agent invocation.
21+
_REVIEW_SYSTEM_PROMPT = """\
22+
You are a code reviewer working inside the repository {repo_url} on branch {branch_name}.
23+
Your working directory is {repo_dir}.
24+
25+
You have full access to the filesystem and can run commands. Fix any issues you \
26+
find directly — edit files, run the build, and commit fixes. Keep changes minimal \
27+
and focused.
28+
29+
Do NOT open a pull request or push. Just fix issues and commit locally.
30+
"""
31+
32+
33+
def _get_diff(repo_dir: str, default_branch: str) -> str:
34+
"""Generate the cumulative diff of the branch vs origin/{default_branch}."""
35+
try:
36+
result = subprocess.run(
37+
["git", "diff", f"origin/{default_branch}...HEAD"],
38+
cwd=repo_dir,
39+
capture_output=True,
40+
text=True,
41+
timeout=60,
42+
)
43+
if result.returncode != 0:
44+
log("WARN", f"self_review: git diff failed (exit {result.returncode})")
45+
return ""
46+
return result.stdout
47+
except (subprocess.TimeoutExpired, OSError) as e:
48+
log("WARN", f"self_review: git diff error: {type(e).__name__}: {e}")
49+
return ""
50+
51+
52+
def _truncate_diff(diff: str, max_chars: int = _MAX_DIFF_CHARS) -> str:
53+
"""Truncate diff at a hunk boundary if it exceeds max_chars.
54+
55+
Cuts at the last complete hunk (line starting with '@@') that fits
56+
within the limit, appending a truncation notice.
57+
"""
58+
if len(diff) <= max_chars:
59+
return diff
60+
61+
# Find the last hunk header that starts before max_chars
62+
truncated = diff[:max_chars]
63+
last_hunk = truncated.rfind("\n@@")
64+
if last_hunk > 0:
65+
# Cut just before this hunk header
66+
truncated = truncated[:last_hunk]
67+
else:
68+
# No hunk boundary found — hard-cut at max_chars
69+
last_newline = truncated.rfind("\n")
70+
if last_newline > 0:
71+
truncated = truncated[:last_newline]
72+
73+
total_lines = diff.count("\n")
74+
kept_lines = truncated.count("\n")
75+
truncated += (
76+
f"\n\n... [diff truncated: showing ~{kept_lines} of ~{total_lines} lines; "
77+
f"{len(diff) - len(truncated)} chars omitted] ..."
78+
)
79+
return truncated
80+
81+
82+
def _build_review_system_prompt(config: TaskConfig, setup: RepoSetup) -> str:
83+
"""Build a minimal system prompt for the self-review agent."""
84+
return _REVIEW_SYSTEM_PROMPT.format(
85+
repo_url=config.repo_url,
86+
branch_name=setup.branch,
87+
repo_dir=setup.repo_dir,
88+
)
89+
90+
91+
def run_self_review(
92+
config: TaskConfig,
93+
setup: RepoSetup,
94+
agent_result: AgentResult,
95+
trajectory: _TrajectoryWriter,
96+
progress: _ProgressWriter,
97+
) -> AgentResult | None:
98+
"""Run the self-review phase: LLM critiques its own diff and fixes issues.
99+
100+
Returns the AgentResult from the review phase, or None if skipped.
101+
Fail-open: errors are logged but never block the pipeline.
102+
"""
103+
# Skip condition: feature disabled
104+
if not config.self_review_enabled:
105+
log("TASK", "self_review: disabled (self_review_enabled=False)")
106+
return None
107+
108+
# Skip condition: pr_review is read-only (no diff to review)
109+
if config.task_type == "pr_review":
110+
log("TASK", "self_review: skipped for pr_review task type")
111+
return None
112+
113+
# Compute remaining turns
114+
used_turns = agent_result.turns or 0
115+
remaining_turns = config.max_turns - used_turns
116+
review_turns = min(remaining_turns, config.self_review_max_turns)
117+
if review_turns <= 0:
118+
log("TASK", f"self_review: no remaining turns (used={used_turns}, max={config.max_turns})")
119+
return None
120+
121+
# Compute remaining budget
122+
review_budget: float | None = None
123+
if config.max_budget_usd is not None:
124+
used_cost = agent_result.cost_usd or 0.0
125+
remaining_budget = config.max_budget_usd - used_cost
126+
if remaining_budget <= 0:
127+
log(
128+
"TASK",
129+
f"self_review: no remaining budget "
130+
f"(used=${used_cost:.2f}, max=${config.max_budget_usd:.2f})",
131+
)
132+
return None
133+
review_budget = remaining_budget
134+
135+
# Get the diff
136+
diff = _get_diff(setup.repo_dir, setup.default_branch)
137+
if not diff.strip():
138+
log("TASK", "self_review: no diff found — skipping")
139+
return None
140+
141+
# Truncate if needed
142+
diff = _truncate_diff(diff)
143+
144+
# Build the review prompt
145+
task_desc = config.task_description or f"Issue #{config.issue_number}"
146+
user_prompt = SELF_REVIEW_PROMPT.format(diff=diff, task_description=task_desc)
147+
system_prompt = _build_review_system_prompt(config, setup)
148+
149+
# Build a modified config for the review run
150+
review_config = config.model_copy(
151+
update={
152+
"max_turns": review_turns,
153+
"max_budget_usd": review_budget,
154+
}
155+
)
156+
157+
log(
158+
"TASK",
159+
f"self_review: starting (turns={review_turns}, "
160+
f"budget={'$' + f'{review_budget:.2f}' if review_budget else 'unlimited'}, "
161+
f"diff_chars={len(diff)})",
162+
)
163+
progress.write_agent_milestone(
164+
"self_review_started",
165+
f"turns={review_turns} diff_chars={len(diff)}",
166+
)
167+
168+
try:
169+
from runner import run_agent
170+
171+
review_result = asyncio.run(
172+
run_agent(
173+
user_prompt,
174+
system_prompt,
175+
review_config,
176+
cwd=setup.repo_dir,
177+
trajectory=trajectory,
178+
)
179+
)
180+
except Exception as e:
181+
# Fail-open: self-review errors never block the pipeline
182+
log("WARN", f"self_review: agent execution failed: {type(e).__name__}: {e}")
183+
progress.write_agent_milestone(
184+
"self_review_complete",
185+
f"status=error error={type(e).__name__}: {e}",
186+
)
187+
return None
188+
189+
log(
190+
"TASK",
191+
f"self_review: complete (status={review_result.status}, "
192+
f"turns={review_result.turns}, cost=${review_result.cost_usd or 0:.4f})",
193+
)
194+
progress.write_agent_milestone(
195+
"self_review_complete",
196+
f"status={review_result.status} turns={review_result.turns}",
197+
)
198+
return review_result

agent/src/server.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -386,6 +386,8 @@ def _run_task_background(
386386
user_id: str = "",
387387
workload_access_token: str = "",
388388
attachments: list[dict] | None = None,
389+
self_review_enabled: bool = False,
390+
self_review_max_turns: int = 5,
389391
) -> None:
390392
"""Run the agent task in a background thread."""
391393
global _background_pipeline_failed
@@ -469,6 +471,8 @@ def _run_task_background(
469471
trace=trace,
470472
user_id=user_id,
471473
attachments=attachments,
474+
self_review_enabled=self_review_enabled,
475+
self_review_max_turns=self_review_max_turns,
472476
)
473477
_background_pipeline_failed = False
474478
except Exception as e:
@@ -557,6 +561,9 @@ def _extract_invocation_params(inp: dict, request: Request) -> dict:
557561
channel_source = inp.get("channel_source", "") or ""
558562
channel_metadata = inp.get("channel_metadata") or {}
559563
attachments = inp.get("attachments") or []
564+
# Self-review (opt-in): LLM critiques its own diff before PR creation.
565+
self_review_enabled = inp.get("self_review_enabled") is True
566+
self_review_max_turns = int(inp.get("self_review_max_turns", 5))
560567
# ``trace`` is strictly opt-in (design §10.1). Accept only real
561568
# booleans from the orchestrator — a string "false" would otherwise
562569
# flip the flag on.
@@ -630,6 +637,8 @@ def _extract_invocation_params(inp: dict, request: Request) -> dict:
630637
"user_id": user_id,
631638
"workload_access_token": workload_access_token,
632639
"attachments": attachments,
640+
"self_review_enabled": self_review_enabled,
641+
"self_review_max_turns": self_review_max_turns,
633642
}
634643

635644

0 commit comments

Comments
 (0)