Skip to content

Commit a8e461d

Browse files
fix(docs): address CodeRabbit review findings on PR #5853
Four targeted fixes responding to the CodeRabbit (coderabbitai) review on the docs PR: 1. examples/dspy_optimization.ipynb: replace the assert statements used for env var checks with an explicit RuntimeError so the notebook fails immediately with a clear message even when Python is run with -O, which strips assert statements. 2. examples/dspy_optimization.ipynb: use a held-out topic (HOLDOUT_TOPIC = "customer webinar follow-up") for the baseline and optimized crew comparison so the demo comparison is fair and does not overstate gains by using a topic that appears in the training set. 3. examples/dspy_optimization.ipynb: use os.getenv("JUDGE_MODEL", "claude-haiku-4-5"), i.e. the stable alias, instead of a pinned dated version so the notebook keeps working across minor model updates; users can still pin a version via the JUDGE_MODEL env var. 4. docs/en/concepts/dspy-optimization.mdx: apply the same JUDGE_MODEL env var pattern, with the claude-haiku-4-5 stable alias as the default, in the LLM-judge metric code example. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 1170e76 commit a8e461d

2 files changed

Lines changed: 27 additions & 15 deletions

File tree

docs/en/concepts/dspy-optimization.mdx

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -160,6 +160,9 @@ import anthropic
160160
import os
161161

162162
_judge = anthropic.Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])
163+
# Stable alias keeps the example working across minor model updates.
164+
# Set JUDGE_MODEL to pin a specific version (e.g. "claude-haiku-4-5-20251001") for reproducibility.
165+
_JUDGE_MODEL = os.getenv("JUDGE_MODEL", "claude-haiku-4-5")
163166

164167
def llm_judge_metric(example: dspy.Example, prediction) -> float:
165168
"""Score email quality 0.0–1.0 using Claude as a pairwise judge."""
@@ -173,7 +176,7 @@ def llm_judge_metric(example: dspy.Example, prediction) -> float:
173176
f"Reply with a single float between 0.0 (A much worse) and 1.0 (A much better). Nothing else."
174177
)
175178
resp = _judge.messages.create(
176-
model="claude-haiku-4-5-20251001",
179+
model=_JUDGE_MODEL,
177180
max_tokens=16,
178181
messages=[{"role": "user", "content": prompt}],
179182
)

examples/dspy_optimization.ipynb

Lines changed: 23 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212
"- How to define a crew and a labeled training set\n",
1313
"- How to write an LLM-judge metric\n",
1414
"- How to run `DSPyOptimizer.compile()` and inspect the result\n",
15-
"- How to compare baseline vs. optimized crew output\n",
15+
"- How to compare baseline vs. optimized crew output on a held-out topic\n",
1616
"\n",
1717
"**Prerequisites:**\n",
1818
"```bash\n",
@@ -37,9 +37,11 @@
3737
"from crewai import Agent, Task, Crew, LLM\n",
3838
"from crewai.optimizers import DSPyOptimizer\n",
3939
"\n",
40-
"# Verify required keys are present before running expensive LLM calls\n",
41-
"assert os.getenv(\"OPENAI_API_KEY\"), \"Set OPENAI_API_KEY in your environment\"\n",
42-
"assert os.getenv(\"ANTHROPIC_API_KEY\"), \"Set ANTHROPIC_API_KEY in your environment\"\n",
40+
"# Verify required keys are present before running expensive LLM calls.\n",
41+
"# Raises RuntimeError immediately rather than failing midway through an optimization run.\n",
42+
"missing = [k for k in (\"OPENAI_API_KEY\", \"ANTHROPIC_API_KEY\") if not os.getenv(k)]\n",
43+
"if missing:\n",
44+
" raise RuntimeError(f\"Missing required environment variables: {', '.join(missing)}\")\n",
4345
"\n",
4446
"# Configure DSPy to use the same model as the crew (gpt-4o-mini is cost-efficient for optimization)\n",
4547
"dspy.configure(lm=dspy.LM(\"openai/gpt-4o-mini\", temperature=0.0))\n",
@@ -97,8 +99,10 @@
9799
"metadata": {},
98100
"outputs": [],
99101
"source": [
100-
"# Baseline measurement: run the crew before optimization to establish the starting quality\n",
101-
"baseline_output = crew.kickoff(inputs={\"topic\": \"Q1 earnings call\"})\n",
102+
"# Baseline measurement: run the crew on a held-out topic before optimization.\n",
103+
"# This topic does NOT appear in the training set so the comparison is fair.\n",
104+
"HOLDOUT_TOPIC = \"customer webinar follow-up\"\n",
105+
"baseline_output = crew.kickoff(inputs={\"topic\": HOLDOUT_TOPIC})\n",
102106
"\n",
103107
"print(\"=== BASELINE OUTPUT ===\")\n",
104108
"print(str(baseline_output))"
@@ -114,6 +118,10 @@
114118
"# Uses pairwise comparison with position-bias mitigation via order swap\n",
115119
"_judge_client = anthropic.Anthropic(api_key=os.environ[\"ANTHROPIC_API_KEY\"])\n",
116120
"\n",
121+
"# Use the stable alias so the notebook keeps working across minor model updates.\n",
122+
"# Override with JUDGE_MODEL env var if you need a specific pinned version.\n",
123+
"_JUDGE_MODEL = os.getenv(\"JUDGE_MODEL\", \"claude-haiku-4-5\")\n",
124+
"\n",
117125
"def _pairwise_score(output_a: str, output_b: str) -> float:\n",
118126
" \"\"\"Ask Claude to rate A vs B on email quality criteria. Returns float in [0.0, 1.0].\"\"\"\n",
119127
" prompt = (\n",
@@ -127,7 +135,7 @@
127135
" \"Number only:\"\n",
128136
" )\n",
129137
" resp = _judge_client.messages.create(\n",
130-
" model=\"claude-haiku-4-5-20251001\", # pinned version for reproducibility\n",
138+
" model=_JUDGE_MODEL,\n",
131139
" max_tokens=16,\n",
132140
" messages=[{\"role\": \"user\", \"content\": prompt}],\n",
133141
" )\n",
@@ -148,7 +156,7 @@
148156
" return (forward + reverse) / 2.0\n",
149157
"\n",
150158
"\n",
151-
"print(\"Metric function defined.\")"
159+
"print(f\"Metric function defined. Judge model: {_JUDGE_MODEL}\")"
152160
]
153161
},
154162
{
@@ -157,8 +165,8 @@
157165
"metadata": {},
158166
"outputs": [],
159167
"source": [
160-
"# Training set: 10 diverse email-drafting examples with realistic reference outputs\n",
161-
"# These cover different email types so the optimizer learns generalizable improvements\n",
168+
"# Training set: 10 diverse email-drafting examples with realistic reference outputs.\n",
169+
"# All topics are distinct from HOLDOUT_TOPIC so the holdout stays unseen during training.\n",
162170
"trainset = [\n",
163171
" dspy.Example(\n",
164172
" inputs={\"topic\": \"Q1 earnings call\"},\n",
@@ -276,7 +284,7 @@
276284
" ).with_inputs(\"topic\"),\n",
277285
"]\n",
278286
"\n",
279-
"print(f\"Training set: {len(trainset)} examples covering diverse email types.\")"
287+
"print(f\"Training set: {len(trainset)} examples. Holdout topic: '{HOLDOUT_TOPIC}'\")"
280288
]
281289
},
282290
{
@@ -325,16 +333,17 @@
325333
"metadata": {},
326334
"outputs": [],
327335
"source": [
328-
"# Run the optimized crew and compare with baseline output\n",
329-
"optimized_output = result.crew.kickoff(inputs={\"topic\": \"Q1 earnings call\"})\n",
336+
"# Run the optimized crew on the same held-out topic and compare against baseline.\n",
337+
"# Using HOLDOUT_TOPIC (unseen during training) gives a fair, unbiased comparison.\n",
338+
"optimized_output = result.crew.kickoff(inputs={\"topic\": HOLDOUT_TOPIC})\n",
330339
"\n",
331340
"print(\"=== BASELINE OUTPUT ===\")\n",
332341
"print(str(baseline_output))\n",
333342
"print()\n",
334343
"print(\"=== OPTIMIZED OUTPUT ===\")\n",
335344
"print(str(optimized_output))\n",
336345
"print()\n",
337-
"print(f\"Score improvement: {result.score_delta:+.3f}\")"
346+
"print(f\"Score improvement on training set: {result.score_delta:+.3f}\")"
338347
]
339348
}
340349
],

0 commit comments

Comments
 (0)