Add rich feedback mode to k_module_problem example

codelion · codelion · commit 93b798e76040 · 2025-12-24T10:52:19.000+05:30
Introduces a RICH_FEEDBACK=1 mode that provides detailed feedback on which modules are correct or incorrect, along with actionable hints. Updates the evaluator and iterative agent to support and display this feedback, and documents the new mode and its impact in the README.
diff --git a/examples/k_module_problem/README.md b/examples/k_module_problem/README.md
@@ -166,6 +166,25 @@ This establishes the "no learning" baseline. Any method that beats this is demon
 
 **Key insight**: While OpenEvolve takes more iterations on average (52.3 vs 13), it has a **100% success rate** compared to iterative refinement's 33%. The evolutionary approach's population diversity ensures it eventually escapes local optima that trap single-trajectory methods.
 
+### Rich Feedback Mode: Proving Attribution Matters
+
+To verify that feedback attribution is the key factor, we added a `RICH_FEEDBACK=1` mode that tells the agent exactly which modules are correct/incorrect:
+
+```bash
+RICH_FEEDBACK=1 python run_iterative_trials.py --trials 3 --iterations 100
+```
+
+| Method | Success Rate | Avg Iterations |
+|--------|-------------|----------------|
+| **Iterative (no rich feedback)** | 33% | 13 (when found) |
+| **Iterative (rich feedback)** | **100%** | **3** |
+
+With rich feedback, iterative refinement achieves a **100% success rate in an average of only 3 iterations** - dramatically faster than OpenEvolve's 52 iterations! This proves that:
+
+1. **Feedback attribution is the key factor**, not the optimization method
+2. When feedback is attributable, iterative refinement is highly effective
+3. Evolution is necessary when feedback is NOT attributable (you can't tell which component is wrong)
+
 ## Why This Matters
 
 This example illustrates when you should prefer evolutionary approaches:
diff --git a/examples/k_module_problem/evaluator.py b/examples/k_module_problem/evaluator.py
@@ -9,13 +9,21 @@
 This creates a challenging landscape for iterative refinement but
 allows evolutionary crossover to combine good "building blocks"
 from different individuals.
+
+Set RICH_FEEDBACK=1 to enable rich feedback mode, which tells you
+exactly which modules are correct/incorrect. This demonstrates that
+iterative refinement works well when feedback is attributable.
 """
 
+import os
 import sys
 import time
 import traceback
 import importlib.util
 
+# Rich feedback mode - read once at import time; when RICH_FEEDBACK=1, artifacts reveal which modules are correct/incorrect
+RICH_FEEDBACK = os.environ.get("RICH_FEEDBACK", "0") == "1"
+
 # The correct solution (hidden from the optimizer)
 # This represents the "optimal" pipeline configuration discovered through
 # extensive testing/domain expertise
@@ -141,14 +149,34 @@ def score_config(config: dict) -> tuple:
 
 def build_artifacts(config: dict, correct_count: int, module_results: dict, eval_time: float) -> dict:
     """
-    Build artifacts that provide useful feedback without revealing
-    exactly which modules are correct.
+    Build artifacts that provide useful feedback.
+
+    In normal mode: Only reveals how many modules are correct, not which ones.
+    In rich feedback mode (RICH_FEEDBACK=1): Reveals exactly which modules are correct/incorrect.
     """
     artifacts = {}
 
     # Configuration summary
     artifacts["configuration"] = str(config)
 
+    # Rich feedback mode - reveals which modules are correct/incorrect
+    if RICH_FEEDBACK:
+        correct_modules = [m for m, is_correct in module_results.items() if is_correct]
+        incorrect_modules = [m for m, is_correct in module_results.items() if not is_correct]
+
+        artifacts["module_feedback"] = {
+            "correct": correct_modules,
+            "incorrect": incorrect_modules,
+        }
+
+        if incorrect_modules:
+            hints = []
+            for module in incorrect_modules:
+                hints.append(f"'{module}' is WRONG - try a different option from {VALID_OPTIONS[module]}")
+            artifacts["actionable_hints"] = hints
+        else:
+            artifacts["actionable_hints"] = ["All modules are correct!"]
+
     # Score feedback - tells you how many are correct, but not which ones
     if correct_count == NUM_MODULES:
         artifacts["status"] = "PERFECT! All modules correctly configured!"
diff --git a/examples/k_module_problem/iterative_agent.py b/examples/k_module_problem/iterative_agent.py
@@ -64,6 +64,26 @@ def write_program(program_path: str, code: str) -> None:
         f.write(code)
 
 
+def format_rich_feedback(artifacts: dict) -> str:
+    """Render the 'module_feedback' and 'actionable_hints' artifacts as a Markdown prompt section, or return an empty string when 'module_feedback' is absent."""
+    if "module_feedback" not in artifacts:  # the evaluator only adds this key when RICH_FEEDBACK is enabled
+        return ""
+
+    feedback = artifacts["module_feedback"]
+    hints = artifacts.get("actionable_hints", [])  # hints are optional; default to none
+
+    result = "\n## DETAILED MODULE FEEDBACK (Rich Feedback Mode)\n"  # leading newline leaves a blank line before the heading
+    result += f"- CORRECT modules: {feedback.get('correct', [])}\n"  # the list is interpolated as its Python repr
+    result += f"- INCORRECT modules: {feedback.get('incorrect', [])}\n"
+
+    if hints:  # skip the hints sub-section entirely when there are no hints
+        result += "\n### Actionable Hints:\n"
+        for hint in hints:
+            result += f"- {hint}\n"  # one Markdown bullet per hint
+
+    return result
+
+
 def create_improvement_prompt(
     current_code: str,
     metrics: dict,
@@ -108,6 +128,7 @@ def create_improvement_prompt(
 - Score: {metrics.get('combined_score', 0):.2%}
 - Status: {artifacts.get('status', 'N/A')}
 - Suggestion: {artifacts.get('suggestion', 'N/A')}
+{format_rich_feedback(artifacts)}
 {history_str}
 
 ## Your Task
@@ -205,7 +226,11 @@ def run_iterative_refinement(
 
         # Evaluate current program
         eval_result = evaluate(str(current_program_path))
-        metrics = eval_result.get("metrics", {})
+        # Handle both flat (success) and nested (error) return formats
+        if "metrics" in eval_result:
+            metrics = eval_result["metrics"]
+        else:
+            metrics = {k: v for k, v in eval_result.items() if k != "artifacts"}
         artifacts = eval_result.get("artifacts", {})
 
         score = metrics.get("combined_score", 0)