Skip to content

Commit b5685ad

Browse files
committed
fix(evaluation): address code review issues
- Add Zod schema validation for layout judge response (prevents NaN from malformed model output) - Add activity tracking to generateFormSpecWithLayout (matches generateFormSpec) - Update formSpecGenerator type to accept activity-tracking params - Add evaluationRunSchema.parse() before writing layout evaluation results - Fix score() signature to include _groundTruth parameter per EvaluationKind interface - Remove erroneous groundTruth filter in layout evaluation subcommand - Export buildLayoutPrompt from form-documents public index - Fix test import to use public index instead of internal path - Remove buildLayoutJudgePrompt from evaluation public index (implementation detail)
1 parent db9cb4d commit b5685ad

9 files changed

Lines changed: 63 additions & 18 deletions

File tree

src/entrypoints/cli/commands/evaluate.ts

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -443,10 +443,9 @@ export async function evaluate(
443443
'../../../../fixtures/index'
444444
)
445445
const fixtures = loadAllFixturesForEvaluation()
446-
const withGT = fixtures.filter((f) => f.groundTruth !== undefined)
447446

448-
if (withGT.length === 0) {
449-
console.error('No fixtures with ground truth found.')
447+
if (fixtures.length === 0) {
448+
console.error('No fixtures found.')
450449
return 1
451450
}
452451

@@ -482,12 +481,12 @@ export async function evaluate(
482481
)
483482

484483
console.log(`Running layout evaluation: ${strategyMeta.metadata.name}`)
485-
console.log(`Fixtures: ${withGT.length}`)
484+
console.log(`Fixtures: ${fixtures.length}`)
486485

487486
const start = Date.now()
488487
const cases: RunResult['cases'] = []
489488

490-
for (const fixture of withGT) {
489+
for (const fixture of fixtures) {
491490
try {
492491
const result = await extractor.extract(fixture.pdf, {
493492
slug: fixture.slug,
@@ -527,6 +526,8 @@ export async function evaluate(
527526
cases,
528527
}
529528

529+
evaluationRunSchema.parse(runResult)
530+
530531
mkdirSync(outDir, { recursive: true })
531532
const jsonPath = join(outDir, `${strategyId}.json`)
532533
writeFileSync(jsonPath, JSON.stringify(runResult, null, 2))

src/services/evaluation/index.ts

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,5 @@ export {
2525
export { createLlmJudgeKind } from './kinds/pdf-field-extraction-judge'
2626
export { shapingCommandsKind } from './kinds/shaping-commands'
2727
export { createBedrockLayoutJudge } from './layout-judge'
28-
// Layout judge prompt
29-
export { buildLayoutJudgePrompt } from './layout-judge-prompt'
3028
export { evaluationRunSchema } from './schemas'
3129
export type { RunResult } from './types'

src/services/evaluation/kinds/layout-quality.ts

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,15 @@
11
import type { DataCollectionSpec } from '../../data-collection'
22
import type { FormSpec } from '../../forms'
33
import type { CaseMetrics, EvaluationKind, SummaryMetrics } from '../types'
4+
import type { LayoutJudgeResponse } from '../layout-judge-schemas'
5+
6+
export type { LayoutJudgeResponse }
47

58
export interface LayoutQualityOutput {
69
spec: DataCollectionSpec
710
formSpec: FormSpec
811
}
912

10-
export interface LayoutJudgeResponse {
11-
scores: Record<string, { score: number; rationale: string }>
12-
}
13-
1413
export interface LayoutJudge {
1514
judge(
1615
spec: DataCollectionSpec,
@@ -41,7 +40,7 @@ export function createLayoutQualityKind(
4140
description:
4241
'Evaluates FormSpec layout quality using LLM-as-judge against a civic tech best practices rubric',
4342

44-
async score(output: LayoutQualityOutput): Promise<CaseMetrics> {
43+
async score(output: LayoutQualityOutput, _groundTruth: undefined): Promise<CaseMetrics> {
4544
const response = await judge.judge(output.spec, output.formSpec)
4645

4746
const metrics: Record<string, number> = {}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
import { z } from 'zod'
2+
3+
const layoutJudgeEntrySchema = z.object({
4+
score: z.number().min(1).max(5),
5+
rationale: z.string(),
6+
})
7+
8+
export const layoutJudgeResponseSchema = z.object({
9+
scores: z.record(z.string(), layoutJudgeEntrySchema),
10+
})
11+
12+
export type LayoutJudgeResponse = z.infer<typeof layoutJudgeResponseSchema>

src/services/evaluation/layout-judge.ts

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,12 @@ import { fromNodeProviderChain } from '@aws-sdk/credential-providers'
33
import { generateText } from 'ai'
44
import type { DataCollectionSpec } from '../data-collection'
55
import type { FormSpec } from '../forms'
6-
import type { LayoutJudge, LayoutJudgeResponse } from './kinds/layout-quality'
6+
import type { LayoutJudge } from './kinds/layout-quality'
77
import { buildLayoutJudgePrompt } from './layout-judge-prompt'
8+
import {
9+
type LayoutJudgeResponse,
10+
layoutJudgeResponseSchema,
11+
} from './layout-judge-schemas'
812

913
export function createBedrockLayoutJudge(model: string): LayoutJudge {
1014
const bedrock = createAmazonBedrock({
@@ -29,7 +33,7 @@ export function createBedrockLayoutJudge(model: string): LayoutJudge {
2933
const jsonStr = trimmed.startsWith('```')
3034
? trimmed.replace(/^```(?:json)?\s*\n?/, '').replace(/\n?```\s*$/, '')
3135
: trimmed
32-
return JSON.parse(jsonStr) as LayoutJudgeResponse
36+
return layoutJudgeResponseSchema.parse(JSON.parse(jsonStr))
3337
},
3438
}
3539
}

src/services/form-documents/extraction-steps.ts

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,12 +91,27 @@ ${JSON.stringify(spec, null, 2)}`,
9191
export async function generateFormSpecWithLayout(
9292
model: LanguageModel,
9393
spec: DataCollectionSpec,
94+
activityStore?: ActivityStore,
95+
userId?: string,
96+
projectId?: string,
97+
modelId?: string,
9498
): Promise<FormSpec> {
99+
const startTime = Date.now()
95100
const result = await generateText({
96101
model,
97102
maxOutputTokens: 8192,
98103
messages: [{ role: 'user', content: buildLayoutPrompt(spec) }],
99104
})
105+
if (activityStore && modelId) {
106+
trackLlmCall(activityStore, {
107+
userId,
108+
projectId,
109+
operation: 'extraction-formspec-layout',
110+
model: modelId,
111+
usage: result.usage,
112+
durationMs: Date.now() - startTime,
113+
})
114+
}
100115
return parseJsonResponse(result.text, formSpecSchema)
101116
}
102117

src/services/form-documents/extraction.ts

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -113,11 +113,17 @@ export interface BedrockExtractorOptions {
113113
/**
114114
* Custom FormSpec generator for Step 2. When provided, replaces the
115115
* default `generateFormSpec` call. Use `generateFormSpecWithLayout`
116-
* for layout-aware generation.
116+
* for layout-aware generation. Receives the same activity-tracking
117+
* arguments as `generateFormSpec` so LLM cost is tracked regardless
118+
* of which generator is active.
117119
*/
118120
formSpecGenerator?: (
119121
model: LanguageModel,
120122
spec: DataCollectionSpec,
123+
activityStore?: ActivityStore,
124+
userId?: string,
125+
projectId?: string,
126+
modelId?: string,
121127
) => Promise<FormSpec>
122128
}
123129

@@ -324,7 +330,14 @@ ${exemplarSection}Guidelines:
324330
// Step 2: Generate default FormSpec from extracted spec
325331
const bedrockModel = bedrock(model)
326332
const formSpec = options?.formSpecGenerator
327-
? await options.formSpecGenerator(bedrockModel, spec)
333+
? await options.formSpecGenerator(
334+
bedrockModel,
335+
spec,
336+
options?.activityStore,
337+
extractionOptions?.userId,
338+
extractionOptions?.slug,
339+
model,
340+
)
328341
: await generateFormSpec(
329342
bedrockModel,
330343
spec,

src/services/form-documents/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ export {
88
createCachedPdfExtractor,
99
} from './extraction'
1010
export { generateFormSpecWithLayout } from './extraction-steps'
11+
export { buildLayoutPrompt } from './layout-prompt'
1112
export { enumerateFields } from './field-mapping'
1213
export { fillPdf } from './filling'
1314
export { createMappingRegistry } from './mapping-registry'

test/form-documents/layout-prompt.test.ts

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
import { describe, expect, it } from 'bun:test'
22
import type { DataCollectionSpec } from '../../src/services/data-collection'
3-
import { generateFormSpecWithLayout } from '../../src/services/form-documents/extraction-steps'
4-
import { buildLayoutPrompt } from '../../src/services/form-documents/layout-prompt'
3+
import {
4+
buildLayoutPrompt,
5+
generateFormSpecWithLayout,
6+
} from '../../src/services/form-documents'
57

68
const smallSpec: DataCollectionSpec = {
79
id: 'contact-info',

0 commit comments

Comments
 (0)