that-github-user · that-github-user · Mar 28, 2026 · Mar 28, 2026
diff --git a/docs/architecture.md b/docs/architecture.md
@@ -111,14 +111,16 @@ Each agent receives a composite score:
 |--------|--------|-----------|
 | Tests pass | +100 | Strongest signal — code works |
 | Convergence group | +0 to +50 | group_score × 50 — consensus is confidence |
-| Smaller diff | +0 to +10 | (1 - normalized_size) × 10 — simpler is better |
+| Diff size outlier | +0 to +10 | Penalizes diffs > 2× median size — catches agents that went off the rails |
+
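+Every diff at or below 2× the median size — whether minimal or thorough — receives the full 10 points. Only outlier-large diffs (more than 2× the median diff size across agents) are penalized proportionally: `max(0, 10 - (ratio - 2) × 5)` where `ratio = agent_lines / median_lines`. For example, a diff 3× the median scores 5, and any diff at or above 4× the median scores 0.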
 
 The agent with the highest total score is recommended. Ties broken by the first agent.
 
 ### Why these weights?
 - Tests (100) dominate because correctness trumps everything
 - Convergence (50) is secondary — agreement without tests is weaker evidence
-- Diff size (10) is a tiebreaker — among equally correct solutions, prefer the simpler one
+- Diff size (10) is a tiebreaker — only penalizes outlier-large diffs that suggest an agent went off the rails, rather than rewarding minimal changes
 
 ## Security Model
 

diff --git a/src/scoring/convergence.test.ts b/src/scoring/convergence.test.ts
@@ -159,15 +159,18 @@ describe("recommend", () => {
     assert.ok(result.recommended === 1 || result.recommended === 2);
   });
 
-  it("prefers smaller diffs as tiebreaker", () => {
+  it("penalizes outlier-large diffs (> 2x median) as tiebreaker", () => {
+    // Agent 1 has 140 lines (> 2x the median of 15); agents 2 and 3 are normal-sized
     const agents = [
-      makeAgent({ id: 1, diff: DIFF_A, linesAdded: 50, linesRemoved: 20 }),
-      makeAgent({ id: 2, diff: DIFF_A, linesAdded: 5, linesRemoved: 2 }),
+      makeAgent({ id: 1, diff: DIFF_A, linesAdded: 100, linesRemoved: 40 }),
+      makeAgent({ id: 2, diff: DIFF_A, linesAdded: 10, linesRemoved: 5 }),
+      makeAgent({ id: 3, diff: DIFF_A, linesAdded: 10, linesRemoved: 5 }),
     ];
     const convergence = analyzeConvergence(agents);
     const result = recommend(agents, [], convergence);
 
-    assert.equal(result.recommended, 2);
+    // Agents 2 and 3 are normal-sized and should be preferred over outlier agent 1
+    assert.ok(result.recommended === 2 || result.recommended === 3);
   });
 
   it("returns per-agent score breakdowns", () => {
@@ -204,10 +207,30 @@ describe("recommend", () => {
     );
   });
 
-  it("gives higher diffSizePoints to smaller diffs", () => {
+  it("gives equal diffSizePoints to non-outlier diffs", () => {
+    // Both agents are within 2x of median — should get the same score
     const agents = [
-      makeAgent({ id: 1, diff: DIFF_A, linesAdded: 50, linesRemoved: 20 }),
-      makeAgent({ id: 2, diff: DIFF_A, linesAdded: 5, linesRemoved: 2 }),
+      makeAgent({ id: 1, diff: DIFF_A, linesAdded: 20, linesRemoved: 10 }),
+      makeAgent({ id: 2, diff: DIFF_A, linesAdded: 10, linesRemoved: 5 }),
+    ];
+    const convergence = analyzeConvergence(agents);
+    const result = recommend(agents, [], convergence);
+
+    const score1 = result.scores.find((s) => s.agentId === 1);
+    const score2 = result.scores.find((s) => s.agentId === 2);
+    assert.ok(score1);
+    assert.ok(score2);
+
+    assert.equal(score1.diffSizePoints, score2.diffSizePoints);
+    assert.equal(score1.diffSizePoints, 10);
+  });
+
+  it("penalizes diffSizePoints for outlier-large diffs", () => {
+    // Agent 1 is > 2x median, so it gets penalized
+    const agents = [
+      makeAgent({ id: 1, diff: DIFF_A, linesAdded: 100, linesRemoved: 50 }),
+      makeAgent({ id: 2, diff: DIFF_A, linesAdded: 10, linesRemoved: 5 }),
+      makeAgent({ id: 3, diff: DIFF_A, linesAdded: 10, linesRemoved: 5 }),
     ];
     const convergence = analyzeConvergence(agents);
     const result = recommend(agents, [], convergence);
@@ -217,6 +240,8 @@ describe("recommend", () => {
     assert.ok(score1);
     assert.ok(score2);
 
-    assert.ok(score2.diffSizePoints > score1.diffSizePoints);
+    assert.ok(score1.diffSizePoints < score2.diffSizePoints);
+    assert.equal(score2.diffSizePoints, 10);
+    assert.ok(score1.diffSizePoints < 10);
   });
 });
diff --git a/src/scoring/convergence.ts b/src/scoring/convergence.ts
@@ -120,7 +120,7 @@ function clusterAgents(
 
 /**
  * Recommend the best agent based on test results and convergence.
- * Priority: passing tests > convergence group size > smaller diff.
+ * Priority: passing tests > convergence group size > diff size outlier penalty.
  */
 export function recommend(
   agents: AgentResult[],
@@ -130,6 +130,14 @@ export function recommend(
   const completed = agents.filter((a) => a.status === "success" && a.diff.length > 0);
   if (completed.length === 0) return { recommended: null, scores: [] };
 
+  // Compute median diff size for outlier detection
+  const sortedLines = completed.map((a) => a.linesAdded + a.linesRemoved).sort((a, b) => a - b);
+  const mid = Math.floor(sortedLines.length / 2);
+  const medianLines =
+    sortedLines.length % 2 === 0
+      ? (sortedLines[mid - 1]! + sortedLines[mid]!) / 2
+      : sortedLines[mid]!;
+
   const agentScores: AgentScore[] = [];
 
   for (const agent of completed) {
@@ -141,10 +149,10 @@ export function recommend(
     const group = convergence.find((g) => g.agents.includes(agent.id));
     const convergencePoints = group ? group.similarity * 50 : 0;
 
-    // Smaller diffs preferred (normalized)
-    const maxLines = Math.max(...completed.map((a) => a.linesAdded + a.linesRemoved), 1);
+    // Penalize outlier-large diffs (> 2x median), otherwise no penalty
     const agentLines = agent.linesAdded + agent.linesRemoved;
-    const diffSizePoints = (1 - agentLines / maxLines) * 10;
+    const ratio = medianLines > 0 ? agentLines / medianLines : 1;
+    const diffSizePoints = ratio > 2 ? Math.max(0, 10 - (ratio - 2) * 5) : 10;
 
     const total = testPoints + convergencePoints + diffSizePoints;