fix(docs): address CodeRabbit review findings on PR #5853

mohamedalishiha · claude · mohamedalishiha · commit a8e461da623c · 2026-05-18T20:16:29.000+02:00
Four targeted fixes responding to coderabbitai review on the docs PR:

1. examples/dspy_optimization.ipynb: replace assert statements for env var
   checks with an explicit RuntimeError so the notebook fails immediately with
   a clear message even when Python is run with -O, which strips asserts.

2. examples/dspy_optimization.ipynb: use a held-out topic (HOLDOUT_TOPIC =
   "customer webinar follow-up") for the baseline and optimized crew comparison
   so the demo comparison is fair and does not overstate gains by using a topic
   that appears in the training set.

3. examples/dspy_optimization.ipynb: use os.getenv("JUDGE_MODEL",
   "claude-haiku-4-5"), i.e. the stable alias instead of the pinned dated
   version, so the notebook keeps working across minor model updates; users
   can still pin a version via the env var.

4. docs/en/concepts/dspy-optimization.mdx: apply the same JUDGE_MODEL env var
   pattern, with the claude-haiku-4-5 stable alias, in the LLM-judge metric
   code example.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
diff --git a/docs/en/concepts/dspy-optimization.mdx b/docs/en/concepts/dspy-optimization.mdx
@@ -160,6 +160,9 @@ import anthropic
 import os
 
 _judge = anthropic.Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])
+# Stable alias keeps the example working across minor model updates.
+# Set JUDGE_MODEL to pin a specific version (e.g. "claude-haiku-4-5-20251001") for reproducibility.
+_JUDGE_MODEL = os.getenv("JUDGE_MODEL", "claude-haiku-4-5")
 
 def llm_judge_metric(example: dspy.Example, prediction) -> float:
     """Score email quality 0.0–1.0 using Claude as a pairwise judge."""
@@ -173,7 +176,7 @@ def llm_judge_metric(example: dspy.Example, prediction) -> float:
             f"Reply with a single float between 0.0 (A much worse) and 1.0 (A much better). Nothing else."
         )
         resp = _judge.messages.create(
-            model="claude-haiku-4-5-20251001",
+            model=_JUDGE_MODEL,
             max_tokens=16,
             messages=[{"role": "user", "content": prompt}],
         )
diff --git a/examples/dspy_optimization.ipynb b/examples/dspy_optimization.ipynb
@@ -12,7 +12,7 @@
     "- How to define a crew and a labeled training set\n",
     "- How to write an LLM-judge metric\n",
     "- How to run `DSPyOptimizer.compile()` and inspect the result\n",
-    "- How to compare baseline vs. optimized crew output\n",
+    "- How to compare baseline vs. optimized crew output on a held-out topic\n",
     "\n",
     "**Prerequisites:**\n",
     "```bash\n",
@@ -37,9 +37,11 @@
     "from crewai import Agent, Task, Crew, LLM\n",
     "from crewai.optimizers import DSPyOptimizer\n",
     "\n",
-    "# Verify required keys are present before running expensive LLM calls\n",
-    "assert os.getenv(\"OPENAI_API_KEY\"), \"Set OPENAI_API_KEY in your environment\"\n",
-    "assert os.getenv(\"ANTHROPIC_API_KEY\"), \"Set ANTHROPIC_API_KEY in your environment\"\n",
+    "# Verify required keys are present before running expensive LLM calls.\n",
+    "# Raises RuntimeError immediately rather than failing midway through an optimization run.\n",
+    "missing = [k for k in (\"OPENAI_API_KEY\", \"ANTHROPIC_API_KEY\") if not os.getenv(k)]\n",
+    "if missing:\n",
+    "    raise RuntimeError(f\"Missing required environment variables: {', '.join(missing)}\")\n",
     "\n",
     "# Configure DSPy to use the same model as the crew (gpt-4o-mini is cost-efficient for optimization)\n",
     "dspy.configure(lm=dspy.LM(\"openai/gpt-4o-mini\", temperature=0.0))\n",
@@ -97,8 +99,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Baseline measurement: run the crew before optimization to establish the starting quality\n",
-    "baseline_output = crew.kickoff(inputs={\"topic\": \"Q1 earnings call\"})\n",
+    "# Baseline measurement: run the crew on a held-out topic before optimization.\n",
+    "# This topic does NOT appear in the training set so the comparison is fair.\n",
+    "HOLDOUT_TOPIC = \"customer webinar follow-up\"\n",
+    "baseline_output = crew.kickoff(inputs={\"topic\": HOLDOUT_TOPIC})\n",
     "\n",
     "print(\"=== BASELINE OUTPUT ===\")\n",
     "print(str(baseline_output))"
@@ -114,6 +118,10 @@
     "# Uses pairwise comparison with position-bias mitigation via order swap\n",
     "_judge_client = anthropic.Anthropic(api_key=os.environ[\"ANTHROPIC_API_KEY\"])\n",
     "\n",
+    "# Use the stable alias so the notebook keeps working across minor model updates.\n",
+    "# Override with JUDGE_MODEL env var if you need a specific pinned version.\n",
+    "_JUDGE_MODEL = os.getenv(\"JUDGE_MODEL\", \"claude-haiku-4-5\")\n",
+    "\n",
     "def _pairwise_score(output_a: str, output_b: str) -> float:\n",
     "    \"\"\"Ask Claude to rate A vs B on email quality criteria. Returns float in [0.0, 1.0].\"\"\"\n",
     "    prompt = (\n",
@@ -127,7 +135,7 @@
     "        \"Number only:\"\n",
     "    )\n",
     "    resp = _judge_client.messages.create(\n",
-    "        model=\"claude-haiku-4-5-20251001\",  # pinned version for reproducibility\n",
+    "        model=_JUDGE_MODEL,\n",
     "        max_tokens=16,\n",
     "        messages=[{\"role\": \"user\", \"content\": prompt}],\n",
     "    )\n",
@@ -148,7 +156,7 @@
     "    return (forward + reverse) / 2.0\n",
     "\n",
     "\n",
-    "print(\"Metric function defined.\")"
+    "print(f\"Metric function defined. Judge model: {_JUDGE_MODEL}\")"
    ]
   },
   {
@@ -157,8 +165,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Training set: 10 diverse email-drafting examples with realistic reference outputs\n",
-    "# These cover different email types so the optimizer learns generalizable improvements\n",
+    "# Training set: 10 diverse email-drafting examples with realistic reference outputs.\n",
+    "# All topics are distinct from HOLDOUT_TOPIC so the holdout stays unseen during training.\n",
     "trainset = [\n",
     "    dspy.Example(\n",
     "        inputs={\"topic\": \"Q1 earnings call\"},\n",
@@ -276,7 +284,7 @@
     "    ).with_inputs(\"topic\"),\n",
     "]\n",
     "\n",
-    "print(f\"Training set: {len(trainset)} examples covering diverse email types.\")"
+    "print(f\"Training set: {len(trainset)} examples. Holdout topic: '{HOLDOUT_TOPIC}'\")"
    ]
   },
   {
@@ -325,16 +333,17 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Run the optimized crew and compare with baseline output\n",
-    "optimized_output = result.crew.kickoff(inputs={\"topic\": \"Q1 earnings call\"})\n",
+    "# Run the optimized crew on the same held-out topic and compare against baseline.\n",
+    "# Using HOLDOUT_TOPIC (unseen during training) gives a fair, unbiased comparison.\n",
+    "optimized_output = result.crew.kickoff(inputs={\"topic\": HOLDOUT_TOPIC})\n",
     "\n",
     "print(\"=== BASELINE OUTPUT ===\")\n",
     "print(str(baseline_output))\n",
     "print()\n",
     "print(\"=== OPTIMIZED OUTPUT ===\")\n",
     "print(str(optimized_output))\n",
     "print()\n",
-    "print(f\"Score improvement: {result.score_delta:+.3f}\")"
+    "print(f\"Score improvement on training set: {result.score_delta:+.3f}\")"
    ]
   }
  ],