|
12 | 12 | "- How to define a crew and a labeled training set\n", |
13 | 13 | "- How to write an LLM-judge metric\n", |
14 | 14 | "- How to run `DSPyOptimizer.compile()` and inspect the result\n", |
15 | | - "- How to compare baseline vs. optimized crew output\n", |
| 15 | + "- How to compare baseline vs. optimized crew output on a held-out topic\n", |
16 | 16 | "\n", |
17 | 17 | "**Prerequisites:**\n", |
18 | 18 | "```bash\n", |
|
37 | 37 | "from crewai import Agent, Task, Crew, LLM\n", |
38 | 38 | "from crewai.optimizers import DSPyOptimizer\n", |
39 | 39 | "\n", |
40 | | - "# Verify required keys are present before running expensive LLM calls\n", |
41 | | - "assert os.getenv(\"OPENAI_API_KEY\"), \"Set OPENAI_API_KEY in your environment\"\n", |
42 | | - "assert os.getenv(\"ANTHROPIC_API_KEY\"), \"Set ANTHROPIC_API_KEY in your environment\"\n", |
| 40 | + "# Verify required keys are present before running expensive LLM calls.\n", |
| 41 | + "# Raises RuntimeError immediately rather than failing midway through an optimization run.\n", |
| 42 | + "missing = [k for k in (\"OPENAI_API_KEY\", \"ANTHROPIC_API_KEY\") if not os.getenv(k)]\n", |
| 43 | + "if missing:\n", |
| 44 | + " raise RuntimeError(f\"Missing required environment variables: {', '.join(missing)}\")\n", |
43 | 45 | "\n", |
44 | 46 | "# Configure DSPy to use the same model as the crew (gpt-4o-mini is cost-efficient for optimization)\n", |
45 | 47 | "dspy.configure(lm=dspy.LM(\"openai/gpt-4o-mini\", temperature=0.0))\n", |
|
97 | 99 | "metadata": {}, |
98 | 100 | "outputs": [], |
99 | 101 | "source": [ |
100 | | - "# Baseline measurement: run the crew before optimization to establish the starting quality\n", |
101 | | - "baseline_output = crew.kickoff(inputs={\"topic\": \"Q1 earnings call\"})\n", |
| 102 | + "# Baseline measurement: run the crew on a held-out topic before optimization.\n", |
| 103 | + "# This topic does NOT appear in the training set so the comparison is fair.\n", |
| 104 | + "HOLDOUT_TOPIC = \"customer webinar follow-up\"\n", |
| 105 | + "baseline_output = crew.kickoff(inputs={\"topic\": HOLDOUT_TOPIC})\n", |
102 | 106 | "\n", |
103 | 107 | "print(\"=== BASELINE OUTPUT ===\")\n", |
104 | 108 | "print(str(baseline_output))" |
|
114 | 118 | "# Uses pairwise comparison with position-bias mitigation via order swap\n", |
115 | 119 | "_judge_client = anthropic.Anthropic(api_key=os.environ[\"ANTHROPIC_API_KEY\"])\n", |
116 | 120 | "\n", |
| 121 | + "# Use the stable alias so the notebook keeps working across minor model updates.\n", |
| 122 | + "# Override with the JUDGE_MODEL env var (e.g. claude-haiku-4-5-20251001) to pin a specific version for reproducible scores.\n", |
| 123 | + "_JUDGE_MODEL = os.getenv(\"JUDGE_MODEL\", \"claude-haiku-4-5\")\n", |
| 124 | + "\n", |
117 | 125 | "def _pairwise_score(output_a: str, output_b: str) -> float:\n", |
118 | 126 | " \"\"\"Ask Claude to rate A vs B on email quality criteria. Returns float in [0.0, 1.0].\"\"\"\n", |
119 | 127 | " prompt = (\n", |
|
127 | 135 | " \"Number only:\"\n", |
128 | 136 | " )\n", |
129 | 137 | " resp = _judge_client.messages.create(\n", |
130 | | - " model=\"claude-haiku-4-5-20251001\", # pinned version for reproducibility\n", |
| 138 | + " model=_JUDGE_MODEL,\n", |
131 | 139 | " max_tokens=16,\n", |
132 | 140 | " messages=[{\"role\": \"user\", \"content\": prompt}],\n", |
133 | 141 | " )\n", |
|
148 | 156 | " return (forward + reverse) / 2.0\n", |
149 | 157 | "\n", |
150 | 158 | "\n", |
151 | | - "print(\"Metric function defined.\")" |
| 159 | + "print(f\"Metric function defined. Judge model: {_JUDGE_MODEL}\")" |
152 | 160 | ] |
153 | 161 | }, |
154 | 162 | { |
|
157 | 165 | "metadata": {}, |
158 | 166 | "outputs": [], |
159 | 167 | "source": [ |
160 | | - "# Training set: 10 diverse email-drafting examples with realistic reference outputs\n", |
161 | | - "# These cover different email types so the optimizer learns generalizable improvements\n", |
| 168 | + "# Training set: 10 diverse email-drafting examples with realistic reference outputs.\n", |
| 169 | + "# All topics are distinct from HOLDOUT_TOPIC so the holdout stays unseen during training.\n", |
162 | 170 | "trainset = [\n", |
163 | 171 | " dspy.Example(\n", |
164 | 172 | " inputs={\"topic\": \"Q1 earnings call\"},\n", |
|
276 | 284 | " ).with_inputs(\"topic\"),\n", |
277 | 285 | "]\n", |
278 | 286 | "\n", |
279 | | - "print(f\"Training set: {len(trainset)} examples covering diverse email types.\")" |
| 287 | + "print(f\"Training set: {len(trainset)} examples. Holdout topic: '{HOLDOUT_TOPIC}'\")" |
280 | 288 | ] |
281 | 289 | }, |
282 | 290 | { |
|
325 | 333 | "metadata": {}, |
326 | 334 | "outputs": [], |
327 | 335 | "source": [ |
328 | | - "# Run the optimized crew and compare with baseline output\n", |
329 | | - "optimized_output = result.crew.kickoff(inputs={\"topic\": \"Q1 earnings call\"})\n", |
| 336 | + "# Run the optimized crew on the same held-out topic and compare against baseline.\n", |
| 337 | + "# HOLDOUT_TOPIC was not seen during training, so this is a fairer (though single-example, qualitative) comparison.\n", |
| 338 | + "optimized_output = result.crew.kickoff(inputs={\"topic\": HOLDOUT_TOPIC})\n", |
330 | 339 | "\n", |
331 | 340 | "print(\"=== BASELINE OUTPUT ===\")\n", |
332 | 341 | "print(str(baseline_output))\n", |
333 | 342 | "print()\n", |
334 | 343 | "print(\"=== OPTIMIZED OUTPUT ===\")\n", |
335 | 344 | "print(str(optimized_output))\n", |
336 | 345 | "print()\n", |
337 | | - "print(f\"Score improvement: {result.score_delta:+.3f}\")" |
| 346 | + "print(f\"Score improvement on training set: {result.score_delta:+.3f}\")" |
338 | 347 | ] |
339 | 348 | } |
340 | 349 | ], |
|
0 commit comments