databricks-solutions
diff --git a/client/package-lock.json b/client/package-lock.json
Lines changed: 12785 additions & 5555 deletions
diff --git a/client/package.json b/client/package.json
Lines changed: 1 addition & 0 deletions
diff --git a/client/src/client/models/CriterionEvaluation.ts b/client/src/client/models/CriterionEvaluation.ts
Lines changed: 16 additions & 0 deletions
diff --git a/client/src/client/models/CriterionEvaluationCreate.ts b/client/src/client/models/CriterionEvaluationCreate.ts
Lines changed: 11 additions & 0 deletions
diff --git a/client/src/components/eval/EvalGradingPanel.tsx b/client/src/components/eval/EvalGradingPanel.tsx
Lines changed: 205 additions & 0 deletions
diff --git a/pyproject.toml b/pyproject.toml
Lines changed: 1 addition & 2 deletions
diff --git a/server/database.py b/server/database.py
Lines changed: 1 addition & 0 deletions
diff --git a/server/routers/eval_mode.py b/server/routers/eval_mode.py
Lines changed: 24 additions & 1 deletion
@@ -20,6 +20,7 @@
     "knip": "knip"
   },
   "dependencies": {
+    "@databricks/design-system": "^1.12.22",
     "@radix-ui/react-alert-dialog": "^1.1.15",
     "@radix-ui/react-avatar": "^1.1.10",
     "@radix-ui/react-dialog": "^1.1.14",
 
@@ -0,0 +1,16 @@
+/* generated using openapi-typescript-codegen -- do not edit */
+/* istanbul ignore file */
+/* tslint:disable */
+/* eslint-disable */
+export type CriterionEvaluation = { // Response model of POST /{workshop_id}/traces/{trace_id}/criteria/{criterion_id}/evaluations. NOTE(review): generated file; comments are lost on regeneration.
+    id: string; // presumably the evaluation's own identifier -- TODO confirm against the server model
+    criterion_id: string; // the create endpoint forwards its {criterion_id} path parameter to the service
+    trace_id: string; // the create endpoint forwards its {trace_id} path parameter to the service
+    workshop_id: string; // the create endpoint forwards its {workshop_id} path parameter to the service
+    judge_model: string; // the grading panel submits 'HUMAN'; eval-results accepts an optional judge_model query parameter
+    met: boolean; // the grading panel submits true from its check button and false from its X button
+    rationale?: (string | null); // optional and nullable
+    raw_response?: (Record<string, any> | null); // optional and nullable; arbitrary JSON object
+    created_at?: string; // presumably a serialized timestamp -- TODO confirm format
+};
+
@@ -0,0 +1,11 @@
+/* generated using openapi-typescript-codegen -- do not edit */
+/* istanbul ignore file */
+/* tslint:disable */
+/* eslint-disable */
+export type CriterionEvaluationCreate = { // Request body of POST /{workshop_id}/traces/{trace_id}/criteria/{criterion_id}/evaluations. NOTE(review): generated file; comments are lost on regeneration.
+    judge_model: string; // forwarded to the service as judge_model; the grading panel sends 'HUMAN'
+    met: boolean; // forwarded to the service as met
+    rationale?: (string | null); // optional and nullable; forwarded to the service as rationale
+    raw_response?: (Record<string, any> | null); // optional and nullable; forwarded to the service as raw_response
+};
+
@@ -0,0 +1,205 @@
+import React, { useEffect, useRef } from 'react';
+import { useTraceCriteria, useEvalResults, useCreateCriterionEvaluation } from '@/hooks/useWorkshopApi';
+import { Badge } from '@/components/ui/badge';
+import { Check, X, AlertTriangle, ChevronRight } from 'lucide-react';
+import ReactMarkdown from 'react-markdown';
+import remarkGfm from 'remark-gfm';
+
+interface EvalGradingPanelProps { // Props accepted by EvalGradingPanel.
+  workshopId: string; // Passed to the criteria, eval-results and create-evaluation hooks.
+  traceId: string; // Passed to the same hooks; also used to pick this trace's entry out of the eval results.
+  activeMilestoneRef?: string | null; // When set, the row whose data-milestone-ref equals this value is scrolled into view.
+  onHoverCriterion?: (milestoneRef: string | null) => void; // Called on mouse enter with the row's milestone ref (null if the row has none) and on mouse leave with null.
+  onClose?: () => void; // When provided, a close button is rendered in the header and calls this on click.
+}
+
+// Matches a milestone link such as `[m2](m2)` inside criterion text; group 1 is the number.
+const MILESTONE_LINK_PATTERN = /\[m(\d+)\]\(m\d+\)/;
+
+/**
+ * Panel for manually grading one trace against its criteria.
+ *
+ * Renders one table row per criterion with "met" / "not met" toggle buttons; each
+ * click submits an evaluation with `judge_model: 'HUMAN'` through the
+ * create-evaluation mutation. A score bar at the bottom shows the aggregated
+ * result returned by the eval-results query for this trace.
+ *
+ * Rows whose criterion text contains a milestone link (e.g. `[m2](m2)`) are tagged
+ * with `data-milestone-ref`, so they can be scrolled into view when
+ * `activeMilestoneRef` is set and reported through `onHoverCriterion` on hover.
+ */
+export const EvalGradingPanel: React.FC<EvalGradingPanelProps> = ({
+  workshopId,
+  traceId,
+  activeMilestoneRef,
+  onHoverCriterion,
+  onClose,
+}) => {
+  const { data: criteria = [], isLoading: criteriaLoading } = useTraceCriteria(workshopId, traceId);
+  const { data: evalResults = [], isLoading: resultsLoading } = useEvalResults(workshopId, traceId, 'HUMAN');
+  const createEval = useCreateCriterionEvaluation(workshopId, traceId);
+  const scrollContainerRef = useRef<HTMLDivElement>(null);
+  const isLoading = criteriaLoading || resultsLoading;
+
+  useEffect(() => {
+    const container = scrollContainerRef.current;
+    // While loading, the table (and therefore the ref) is not mounted. `isLoading` is a
+    // dependency so the scroll is retried once the rows exist.
+    if (!activeMilestoneRef || !container) return;
+    // Compare attribute values instead of interpolating the ref into a CSS selector,
+    // so a ref containing quotes or backslashes cannot make the query throw.
+    const target = Array.from(container.querySelectorAll('[data-milestone-ref]')).find(
+      (row) => row.getAttribute('data-milestone-ref') === activeMilestoneRef,
+    );
+    target?.scrollIntoView({ behavior: 'smooth', block: 'center' });
+  }, [activeMilestoneRef, isLoading]);
+
+  const traceScore = evalResults.find(r => r.trace_id === traceId);
+  const criteriaResults = traceScore?.criteria_results || [];
+  const hurdleResults = traceScore?.hurdle_results || [];
+
+  // Combine all results for easy lookup
+  const allResults = [...criteriaResults, ...hurdleResults];
+  const resultsByCriterionId = new Map(allResults.map(r => [r.criterion_id, r]));
+
+  // Submits a human verdict for one criterion.
+  const handleToggle = (criterionId: string, met: boolean) => {
+    createEval.mutate({
+      criterion_id: criterionId,
+      judge_model: 'HUMAN',
+      met,
+    });
+  };
+
+  if (isLoading) {
+    return (
+      <div className="flex items-center justify-center h-full text-slate-400">
+        Loading criteria...
+      </div>
+    );
+  }
+
+  if (criteria.length === 0) {
+    return (
+      <div className="flex flex-col items-center justify-center h-full p-6 text-center text-slate-500">
+        <AlertTriangle className="w-8 h-8 mb-3 text-slate-300" />
+        <p className="text-sm font-medium text-slate-600">No criteria defined</p>
+        <p className="text-xs mt-1 max-w-[200px]">
+          Create criteria in the Discussion tab to start grading.
+        </p>
+      </div>
+    );
+  }
+
+  // Values for the score bar; everything defaults to an empty score until results exist.
+  const rawScore = traceScore?.raw_score ?? 0;
+  const maxPossible = traceScore?.max_possible ?? 0;
+  const normalizedScore = traceScore?.normalized_score ?? 0;
+  const hurdlePassed = traceScore?.hurdle_passed ?? true;
+  // Clamp to 0-100: criteria can carry negative weights, and a negative or >100%
+  // width is not a usable CSS value for the bar.
+  const barPercent = hurdlePassed ? Math.min(100, Math.max(0, normalizedScore * 100)) : 0;
+
+  return (
+    <div className="flex flex-col h-full overflow-hidden bg-white/80 backdrop-blur-2xl rounded-2xl">
+      <div className="flex items-center justify-between px-4 pt-4 pb-2">
+        <h3 className="text-lg font-bold text-slate-900 tracking-tight">
+          Grading
+        </h3>
+        {onClose && (
+          <button
+            type="button"
+            onClick={onClose}
+            aria-label="Close grading panel"
+            className="p-1.5 hover:bg-slate-100 rounded-full text-slate-400 hover:text-slate-600 transition-colors"
+          >
+            <ChevronRight className="w-5 h-5" />
+          </button>
+        )}
+      </div>
+      <div
+        ref={scrollContainerRef}
+        className="flex-1 overflow-y-auto px-4 py-4 custom-scrollbar"
+      >
+        <table className="w-full text-sm text-left">
+          <thead className="text-xs text-slate-500 uppercase bg-slate-50/50 sticky top-0 z-10 backdrop-blur-md">
+            <tr>
+              <th className="px-4 py-3 font-semibold rounded-tl-lg">Criterion</th>
+              <th className="px-4 py-3 font-semibold w-24 text-center">Points</th>
+              <th className="px-4 py-3 font-semibold w-32 text-center rounded-tr-lg">Present</th>
+            </tr>
+          </thead>
+          <tbody className="divide-y divide-slate-100">
+            {criteria.map((criterion) => {
+              const result = resultsByCriterionId.get(criterion.id);
+              const isHurdle = criterion.criterion_type === 'hurdle';
+              // undefined means "not graded yet": neither toggle is highlighted.
+              const isMet = result?.met;
+
+              // Extract milestone ref from text if it exists (e.g. [m2](m2))
+              const milestoneMatch = criterion.text.match(MILESTONE_LINK_PATTERN);
+              const milestoneRef = milestoneMatch ? `m${milestoneMatch[1]}` : null;
+
+              return (
+                <tr
+                  key={criterion.id}
+                  data-milestone-ref={milestoneRef}
+                  className="hover:bg-slate-50/50 transition-colors group"
+                  onMouseEnter={() => onHoverCriterion?.(milestoneRef)}
+                  onMouseLeave={() => onHoverCriterion?.(null)}
+                >
+                  <td className="px-4 py-4">
+                    <div className="prose prose-sm prose-slate max-w-none">
+                      <ReactMarkdown
+                        remarkPlugins={[remarkGfm]}
+                        components={{
+                          p: ({ children }) => <p className="m-0 leading-relaxed font-medium text-slate-700">{children}</p>,
+                          // Milestone links are rendered as plain highlighted text, not navigable anchors.
+                          a: ({ children }) => <span className="text-indigo-600 font-semibold">{children}</span>
+                        }}
+                      >
+                        {criterion.text}
+                      </ReactMarkdown>
+                    </div>
+                  </td>
+                  <td className="px-4 py-4 text-center">
+                    {isHurdle ? (
+                      <Badge variant="outline" className="bg-rose-50 text-rose-700 border-rose-200 uppercase tracking-wider text-[10px]">
+                        Gate
+                      </Badge>
+                    ) : (
+                      <span className={`font-mono font-bold ${criterion.weight > 0 ? 'text-emerald-600' : 'text-rose-600'}`}>
+                        {criterion.weight > 0 ? '+' : ''}{criterion.weight}
+                      </span>
+                    )}
+                  </td>
+                  <td className="px-4 py-4">
+                    <div className="flex items-center justify-center gap-1 bg-slate-100/50 p-1 rounded-lg border border-slate-200/50">
+                      <button
+                        type="button"
+                        aria-label="Mark criterion as met"
+                        aria-pressed={isMet === true}
+                        onClick={() => handleToggle(criterion.id, true)}
+                        className={`flex-1 flex items-center justify-center py-1.5 rounded-md transition-all ${
+                          isMet === true
+                            ? 'bg-emerald-500 text-white shadow-sm'
+                            : 'text-slate-400 hover:text-emerald-600 hover:bg-emerald-50'
+                        }`}
+                      >
+                        <Check className="w-4 h-4" />
+                      </button>
+                      <button
+                        type="button"
+                        aria-label="Mark criterion as not met"
+                        aria-pressed={isMet === false}
+                        onClick={() => handleToggle(criterion.id, false)}
+                        className={`flex-1 flex items-center justify-center py-1.5 rounded-md transition-all ${
+                          isMet === false
+                            ? 'bg-rose-500 text-white shadow-sm'
+                            : 'text-slate-400 hover:text-rose-600 hover:bg-rose-50'
+                        }`}
+                      >
+                        <X className="w-4 h-4" />
+                      </button>
+                    </div>
+                  </td>
+                </tr>
+              );
+            })}
+          </tbody>
+        </table>
+      </div>
+
+      {/* Score Bar (HealthBench style) */}
+      <div className="mt-auto border-t border-slate-200 bg-slate-50/80 p-6">
+        <div className="flex items-center justify-between mb-2">
+          <span className="text-xs font-bold text-slate-500 uppercase tracking-wider">Actual Score</span>
+          <span className="text-xs font-bold text-slate-500 uppercase tracking-wider">Max Score</span>
+        </div>
+
+        <div className="relative h-2 bg-slate-200 rounded-full overflow-hidden mb-2">
+          <div
+            className={`absolute top-0 left-0 h-full rounded-full transition-all duration-500 ${
+              !hurdlePassed ? 'bg-rose-500' : 'bg-emerald-500'
+            }`}
+            style={{ width: `${barPercent}%` }}
+          />
+        </div>
+
+        <div className="flex items-center justify-between">
+          <span className={`text-lg font-bold font-mono ${!hurdlePassed ? 'text-rose-600' : 'text-slate-900'}`}>
+            {!hurdlePassed ? '0 (Gate Failed)' : rawScore}
+          </span>
+          <span className="text-sm font-bold font-mono text-slate-400">
+            {maxPossible}
+          </span>
+        </div>
+      </div>
+    </div>
+  );
+};
@@ -27,7 +27,7 @@ dependencies = [
     "databricks-sdk>=0.14.0",
     "databricks-sql-connector>=3.1.0",
     "pydantic>=2.5.0",
-    "pydantic-ai-slim[openai]>=0.2",
+    "pydantic-ai-slim[openai,ag-ui]>=0.2",
     "python-multipart>=0.0.6",
     "python-dotenv>=1.0.0",
     "httpx>=0.25.0",
@@ -44,7 +44,6 @@ dependencies = [
     "psycopg-binary>=3.1.0", # Pre-built binary for psycopg
     "psycopg-pool>=3.1.0", # Connection pooling for psycopg
     "dspy>=3.1.3",
-    "pydantic-ai-slim[openai]>=0.2",
 ]
 
 [tool.alembic]
 
@@ -723,6 +723,7 @@ class DiscoveryAgentRunDB(Base):
     trigger_comment_id = Column(String, ForeignKey("discovery_comments.id"), nullable=False)
     status = Column(String, nullable=False, default="running")  # running | completed | failed | timeout
     tool_calls_count = Column(Integer, nullable=False, default=0)
+    events = Column(JSON, nullable=False, default=list)
     partial_output = Column(Text, nullable=False, default="")
     final_output = Column(Text, nullable=True)
     error = Column(Text, nullable=True)
 
@@ -9,6 +9,8 @@
 
 from server.database import get_db
 from server.models import (
+    CriterionEvaluation,
+    CriterionEvaluationCreate,
     TraceCriterion,
     TraceCriterionCreate,
     TraceCriterionUpdate,
@@ -81,10 +83,31 @@ async def get_trace_rubric(
     return EvalModeService.render_trace_rubric(workshop_id, trace_id, criteria)
 
 
+@router.post("/{workshop_id}/traces/{trace_id}/criteria/{criterion_id}/evaluations", response_model=CriterionEvaluation, status_code=status.HTTP_201_CREATED)
+async def create_criterion_evaluation(  # Record one judge verdict for a criterion on a trace; responds 201 with the created evaluation.
+    workshop_id: str,  # path parameter
+    trace_id: str,  # path parameter
+    criterion_id: str,  # path parameter
+    data: CriterionEvaluationCreate,  # request body: judge_model, met, rationale, raw_response
+    db: Session = Depends(get_db),
+) -> CriterionEvaluation:
+    service = EvalCriteriaService(db)
+    return service.create_evaluation(  # pure delegation; NOTE(review): any validation of the ids would live in the service -- not visible here
+        workshop_id=workshop_id,
+        criterion_id=criterion_id,
+        trace_id=trace_id,
+        judge_model=data.judge_model,
+        met=data.met,
+        rationale=data.rationale,
+        raw_response=data.raw_response,
+    )
+
+
 @router.get("/{workshop_id}/eval-results", response_model=list[TraceEvalScore])
 async def get_eval_results(
     workshop_id: str,
     trace_id: str | None = Query(default=None),
+    judge_model: str | None = Query(default=None),
     db: Session = Depends(get_db),
 ) -> list[TraceEvalScore]:
     criteria_service = EvalCriteriaService(db)
@@ -99,7 +122,7 @@ async def get_eval_results(
     results: list[TraceEvalScore] = []
     for current_trace_id in trace_ids:
         criteria = criteria_service.list_criteria(workshop_id, current_trace_id)
-        evaluations = criteria_service.list_evaluations(workshop_id, current_trace_id)
+        evaluations = criteria_service.list_evaluations(workshop_id, current_trace_id, judge_model=judge_model)
         results.append(EvalModeService.aggregate_trace_score(current_trace_id, criteria, evaluations))
 
     return results