Skip to content

Commit 6ed7b2f

Browse files
chore: misc updates to eval mode and dependencies
1 parent 83d834c commit 6ed7b2f

11 files changed

Lines changed: 13182 additions & 5571 deletions

File tree

client/package-lock.json

Lines changed: 12785 additions & 5555 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

client/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
"knip": "knip"
2121
},
2222
"dependencies": {
23+
"@databricks/design-system": "^1.12.22",
2324
"@radix-ui/react-alert-dialog": "^1.1.15",
2425
"@radix-ui/react-avatar": "^1.1.10",
2526
"@radix-ui/react-dialog": "^1.1.14",
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
/* generated using openapi-typescript-codegen -- do not edit */
2+
/* istanbul ignore file */
3+
/* tslint:disable */
4+
/* eslint-disable */
5+
/**
 * A single criterion evaluation record as exposed by the generated API client.
 *
 * Carries its own `id` plus the `criterion_id`, `trace_id` and `workshop_id`
 * it is associated with, the `judge_model` that produced it, and the boolean
 * `met` verdict. `rationale`, `raw_response` and `created_at` are optional;
 * `rationale` and `raw_response` may also be explicitly `null`.
 *
 * NOTE(review): `created_at` is typed as a plain string — presumably a
 * serialized timestamp; confirm the format against the server schema.
 */
export type CriterionEvaluation = {
6+
id: string;
7+
criterion_id: string;
8+
trace_id: string;
9+
workshop_id: string;
10+
judge_model: string;
11+
met: boolean;
12+
rationale?: (string | null);
13+
raw_response?: (Record<string, any> | null);
14+
created_at?: string;
15+
};
16+
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
/* generated using openapi-typescript-codegen -- do not edit */
2+
/* istanbul ignore file */
3+
/* tslint:disable */
4+
/* eslint-disable */
5+
/**
 * Request payload for creating a criterion evaluation.
 *
 * `judge_model` and `met` are required; `rationale` and `raw_response` are
 * optional and may be `null`. The payload itself carries no workshop, trace
 * or criterion identifiers.
 */
export type CriterionEvaluationCreate = {
6+
judge_model: string;
7+
met: boolean;
8+
rationale?: (string | null);
9+
raw_response?: (Record<string, any> | null);
10+
};
11+
Lines changed: 205 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,205 @@
1+
import React, { useEffect, useRef } from 'react';
2+
import { useTraceCriteria, useEvalResults, useCreateCriterionEvaluation } from '@/hooks/useWorkshopApi';
3+
import { Badge } from '@/components/ui/badge';
4+
import { Check, X, AlertTriangle, ChevronRight } from 'lucide-react';
5+
import ReactMarkdown from 'react-markdown';
6+
import remarkGfm from 'remark-gfm';
7+
8+
/** Props accepted by {@link EvalGradingPanel}. */
interface EvalGradingPanelProps {
  /** Workshop whose criteria and evaluation results are loaded. */
  workshopId: string;
  /** Trace being graded. */
  traceId: string;
  /** Milestone ref (e.g. "m2"); the first row carrying it is scrolled into view. */
  activeMilestoneRef?: string | null;
  /** Called with a row's milestone ref on mouse enter and with `null` on mouse leave. */
  onHoverCriterion?: (milestoneRef: string | null) => void;
  /** When provided, a close button is rendered in the panel header. */
  onClose?: () => void;
}

// Matches a milestone link such as `[m2](m2)` inside criterion text.
// Hoisted so the regex is not re-created for every row on every render.
const MILESTONE_LINK_PATTERN = /\[m(\d+)\]\(m\d+\)/;

/** Returns the milestone ref (`m<N>`) linked in the criterion text, or `null` when there is none. */
const extractMilestoneRef = (text: string): string | null => {
  const match = MILESTONE_LINK_PATTERN.exec(text);
  return match ? `m${match[1]}` : null;
};

/**
 * Converts a normalized score (multiplied by 100 for display) into a bar
 * width percentage. The result is clamped to [0, 100] so a negative or
 * greater-than-one score can never produce an invalid or overflowing CSS
 * width; a failed hurdle or a non-finite score yields 0.
 */
const scoreBarWidthPercent = (normalizedScore: number, hurdlePassed: boolean): number => {
  if (!hurdlePassed || !Number.isFinite(normalizedScore)) {
    return 0;
  }
  return Math.min(100, Math.max(0, normalizedScore * 100));
};

/**
 * Grading panel for a single trace.
 *
 * Lists every criterion of the trace with its weight (or a "Gate" badge for
 * hurdle criteria) and a met / not-met toggle that records an evaluation with
 * `judge_model: 'HUMAN'`. A footer shows the raw score against the maximum
 * possible score; when the hurdle has not passed the bar is empty and the
 * score reads "0 (Gate Failed)".
 */
export const EvalGradingPanel: React.FC<EvalGradingPanelProps> = ({
  workshopId,
  traceId,
  activeMilestoneRef,
  onHoverCriterion,
  onClose,
}) => {
  const { data: criteria = [], isLoading: criteriaLoading } = useTraceCriteria(workshopId, traceId);
  const { data: evalResults = [], isLoading: resultsLoading } = useEvalResults(workshopId, traceId, 'HUMAN');
  const createEval = useCreateCriterionEvaluation(workshopId, traceId);
  const scrollContainerRef = useRef<HTMLDivElement>(null);

  useEffect(() => {
    const container = scrollContainerRef.current;
    if (!activeMilestoneRef || !container) {
      return;
    }
    // Compare attribute values instead of interpolating the ref into a CSS
    // selector: a ref containing `"` or `\` would otherwise make
    // querySelector throw a SyntaxError.
    const target = Array.from(container.querySelectorAll('[data-milestone-ref]')).find(
      (row) => row.getAttribute('data-milestone-ref') === activeMilestoneRef,
    );
    target?.scrollIntoView({ behavior: 'smooth', block: 'center' });
  }, [activeMilestoneRef]);

  const traceScore = evalResults.find(r => r.trace_id === traceId);
  const criteriaResults = traceScore?.criteria_results ?? [];
  const hurdleResults = traceScore?.hurdle_results ?? [];

  // Combine regular and hurdle results so each row can look up its verdict by criterion id.
  const allResults = [...criteriaResults, ...hurdleResults];
  const resultsByCriterionId = new Map(allResults.map(r => [r.criterion_id, r]));

  const handleToggle = (criterionId: string, met: boolean) => {
    createEval.mutate({
      criterion_id: criterionId,
      judge_model: 'HUMAN',
      met,
    });
  };

  if (criteriaLoading || resultsLoading) {
    return (
      <div className="flex items-center justify-center h-full text-slate-400">
        Loading criteria...
      </div>
    );
  }

  if (criteria.length === 0) {
    return (
      <div className="flex flex-col items-center justify-center h-full p-6 text-center text-slate-500">
        <AlertTriangle className="w-8 h-8 mb-3 text-slate-300" />
        <p className="text-sm font-medium text-slate-600">No criteria defined</p>
        <p className="text-xs mt-1 max-w-[200px]">
          Create criteria in the Discussion tab to start grading.
        </p>
      </div>
    );
  }

  // Values for the score bar; everything defaults to "no score yet".
  const rawScore = traceScore?.raw_score ?? 0;
  const maxPossible = traceScore?.max_possible ?? 0;
  const normalizedScore = traceScore?.normalized_score ?? 0;
  // With no result at all the hurdle is treated as passed so the bar is not shown as failed.
  const hurdlePassed = traceScore?.hurdle_passed ?? true;

  return (
    <div className="flex flex-col h-full overflow-hidden bg-white/80 backdrop-blur-2xl rounded-2xl">
      <div className="flex items-center justify-between px-4 pt-4 pb-2">
        <h3 className="text-lg font-bold text-slate-900 tracking-tight">
          Grading
        </h3>
        {onClose && (
          <button
            type="button"
            onClick={onClose}
            aria-label="Close grading panel"
            className="p-1.5 hover:bg-slate-100 rounded-full text-slate-400 hover:text-slate-600 transition-colors"
          >
            <ChevronRight className="w-5 h-5" />
          </button>
        )}
      </div>
      <div
        ref={scrollContainerRef}
        className="flex-1 overflow-y-auto px-4 py-4 custom-scrollbar"
      >
        <table className="w-full text-sm text-left">
          <thead className="text-xs text-slate-500 uppercase bg-slate-50/50 sticky top-0 z-10 backdrop-blur-md">
            <tr>
              <th className="px-4 py-3 font-semibold rounded-tl-lg">Criterion</th>
              <th className="px-4 py-3 font-semibold w-24 text-center">Points</th>
              <th className="px-4 py-3 font-semibold w-32 text-center rounded-tr-lg">Present</th>
            </tr>
          </thead>
          <tbody className="divide-y divide-slate-100">
            {criteria.map((criterion) => {
              const result = resultsByCriterionId.get(criterion.id);
              const isHurdle = criterion.criterion_type === 'hurdle';
              // undefined means "not graded yet": neither toggle is highlighted.
              const isMet = result?.met;
              const milestoneRef = extractMilestoneRef(criterion.text);

              return (
                <tr
                  key={criterion.id}
                  data-milestone-ref={milestoneRef}
                  className="hover:bg-slate-50/50 transition-colors group"
                  onMouseEnter={() => onHoverCriterion?.(milestoneRef)}
                  onMouseLeave={() => onHoverCriterion?.(null)}
                >
                  <td className="px-4 py-4">
                    <div className="prose prose-sm prose-slate max-w-none">
                      <ReactMarkdown
                        remarkPlugins={[remarkGfm]}
                        components={{
                          p: ({ children }) => <p className="m-0 leading-relaxed font-medium text-slate-700">{children}</p>,
                          // Links are rendered as plain styled text, not as navigable anchors.
                          a: ({ children }) => <span className="text-indigo-600 font-semibold">{children}</span>
                        }}
                      >
                        {criterion.text}
                      </ReactMarkdown>
                    </div>
                  </td>
                  <td className="px-4 py-4 text-center">
                    {isHurdle ? (
                      <Badge variant="outline" className="bg-rose-50 text-rose-700 border-rose-200 uppercase tracking-wider text-[10px]">
                        Gate
                      </Badge>
                    ) : (
                      <span className={`font-mono font-bold ${criterion.weight > 0 ? 'text-emerald-600' : 'text-rose-600'}`}>
                        {criterion.weight > 0 ? '+' : ''}{criterion.weight}
                      </span>
                    )}
                  </td>
                  <td className="px-4 py-4">
                    <div className="flex items-center justify-center gap-1 bg-slate-100/50 p-1 rounded-lg border border-slate-200/50">
                      <button
                        type="button"
                        aria-label="Mark criterion as met"
                        aria-pressed={isMet === true}
                        onClick={() => handleToggle(criterion.id, true)}
                        className={`flex-1 flex items-center justify-center py-1.5 rounded-md transition-all ${
                          isMet === true
                            ? 'bg-emerald-500 text-white shadow-sm'
                            : 'text-slate-400 hover:text-emerald-600 hover:bg-emerald-50'
                        }`}
                      >
                        <Check className="w-4 h-4" />
                      </button>
                      <button
                        type="button"
                        aria-label="Mark criterion as not met"
                        aria-pressed={isMet === false}
                        onClick={() => handleToggle(criterion.id, false)}
                        className={`flex-1 flex items-center justify-center py-1.5 rounded-md transition-all ${
                          isMet === false
                            ? 'bg-rose-500 text-white shadow-sm'
                            : 'text-slate-400 hover:text-rose-600 hover:bg-rose-50'
                        }`}
                      >
                        <X className="w-4 h-4" />
                      </button>
                    </div>
                  </td>
                </tr>
              );
            })}
          </tbody>
        </table>
      </div>

      {/* Score Bar (HealthBench style) */}
      <div className="mt-auto border-t border-slate-200 bg-slate-50/80 p-6">
        <div className="flex items-center justify-between mb-2">
          <span className="text-xs font-bold text-slate-500 uppercase tracking-wider">Actual Score</span>
          <span className="text-xs font-bold text-slate-500 uppercase tracking-wider">Max Score</span>
        </div>

        <div className="relative h-2 bg-slate-200 rounded-full overflow-hidden mb-2">
          <div
            className={`absolute top-0 left-0 h-full rounded-full transition-all duration-500 ${
              !hurdlePassed ? 'bg-rose-500' : 'bg-emerald-500'
            }`}
            style={{ width: `${scoreBarWidthPercent(normalizedScore, hurdlePassed)}%` }}
          />
        </div>

        <div className="flex items-center justify-between">
          <span className={`text-lg font-bold font-mono ${!hurdlePassed ? 'text-rose-600' : 'text-slate-900'}`}>
            {!hurdlePassed ? '0 (Gate Failed)' : rawScore}
          </span>
          <span className="text-sm font-bold font-mono text-slate-400">
            {maxPossible}
          </span>
        </div>
      </div>
    </div>
  );
};

pyproject.toml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ dependencies = [
2727
"databricks-sdk>=0.14.0",
2828
"databricks-sql-connector>=3.1.0",
2929
"pydantic>=2.5.0",
30-
"pydantic-ai-slim[openai]>=0.2",
30+
"pydantic-ai-slim[openai,ag-ui]>=0.2",
3131
"python-multipart>=0.0.6",
3232
"python-dotenv>=1.0.0",
3333
"httpx>=0.25.0",
@@ -44,7 +44,6 @@ dependencies = [
4444
"psycopg-binary>=3.1.0", # Pre-built binary for psycopg
4545
"psycopg-pool>=3.1.0", # Connection pooling for psycopg
4646
"dspy>=3.1.3",
47-
"pydantic-ai-slim[openai]>=0.2",
4847
]
4948

5049
[tool.alembic]

server/database.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -723,6 +723,7 @@ class DiscoveryAgentRunDB(Base):
723723
trigger_comment_id = Column(String, ForeignKey("discovery_comments.id"), nullable=False)
724724
status = Column(String, nullable=False, default="running") # running | completed | failed | timeout
725725
tool_calls_count = Column(Integer, nullable=False, default=0)
726+
events = Column(JSON, nullable=False, default=list)
726727
partial_output = Column(Text, nullable=False, default="")
727728
final_output = Column(Text, nullable=True)
728729
error = Column(Text, nullable=True)

server/routers/eval_mode.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99

1010
from server.database import get_db
1111
from server.models import (
12+
CriterionEvaluation,
13+
CriterionEvaluationCreate,
1214
TraceCriterion,
1315
TraceCriterionCreate,
1416
TraceCriterionUpdate,
@@ -81,10 +83,31 @@ async def get_trace_rubric(
8183
return EvalModeService.render_trace_rubric(workshop_id, trace_id, criteria)
8284

8385

86+
@router.post(
    "/{workshop_id}/traces/{trace_id}/criteria/{criterion_id}/evaluations",
    response_model=CriterionEvaluation,
    status_code=status.HTTP_201_CREATED,
)
async def create_criterion_evaluation(
    workshop_id: str,
    trace_id: str,
    criterion_id: str,
    data: CriterionEvaluationCreate,
    db: Session = Depends(get_db),
) -> CriterionEvaluation:
    """Create an evaluation for one criterion of a trace.

    The workshop, trace and criterion are taken from the URL path; the judge
    model, verdict, rationale and raw response come from the request body.
    The work is delegated to ``EvalCriteriaService.create_evaluation`` and its
    result is returned with a 201 status.
    """
    criteria_service = EvalCriteriaService(db)
    evaluation = criteria_service.create_evaluation(
        workshop_id=workshop_id,
        criterion_id=criterion_id,
        trace_id=trace_id,
        judge_model=data.judge_model,
        met=data.met,
        rationale=data.rationale,
        raw_response=data.raw_response,
    )
    return evaluation
104+
105+
84106
@router.get("/{workshop_id}/eval-results", response_model=list[TraceEvalScore])
85107
async def get_eval_results(
86108
workshop_id: str,
87109
trace_id: str | None = Query(default=None),
110+
judge_model: str | None = Query(default=None),
88111
db: Session = Depends(get_db),
89112
) -> list[TraceEvalScore]:
90113
criteria_service = EvalCriteriaService(db)
@@ -99,7 +122,7 @@ async def get_eval_results(
99122
results: list[TraceEvalScore] = []
100123
for current_trace_id in trace_ids:
101124
criteria = criteria_service.list_criteria(workshop_id, current_trace_id)
102-
evaluations = criteria_service.list_evaluations(workshop_id, current_trace_id)
125+
evaluations = criteria_service.list_evaluations(workshop_id, current_trace_id, judge_model=judge_model)
103126
results.append(EvalModeService.aggregate_trace_score(current_trace_id, criteria, evaluations))
104127

105128
return results

0 commit comments

Comments
 (0)