Skip to content

Commit 28a1861

Browse files
added question id
1 parent bcfe4c4 commit 28a1861

File tree

8 files changed

+549
-12
lines changed

8 files changed

+549
-12
lines changed

src/orchestrator/batch.ts

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ export interface CompareOptions {
3535
judgeModel: string
3636
answeringModel: string
3737
sampling?: SamplingConfig
38+
questionIds?: string[]
3839
force?: boolean
3940
}
4041

@@ -146,7 +147,7 @@ export class BatchManager {
146147
}
147148

148149
async createManifest(options: CompareOptions): Promise<CompareManifest> {
149-
const { providers, benchmark, judgeModel, answeringModel, sampling } = options
150+
const { providers, benchmark, judgeModel, answeringModel, sampling, questionIds } = options
150151
const compareId = generateCompareId()
151152

152153
logger.info(`Loading benchmark: ${benchmark}`)
@@ -155,7 +156,37 @@ export class BatchManager {
155156
const allQuestions = benchmarkInstance.getQuestions()
156157

157158
let targetQuestionIds: string[]
158-
if (sampling) {
159+
if (questionIds && questionIds.length > 0) {
160+
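// Partition the provided IDs into those present in the benchmark and those that are not; unknown IDs are skipped with a warning, and an error is thrown only if none are valid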
161+
const allQuestionIdsSet = new Set(allQuestions.map((q) => q.questionId))
162+
const validIds: string[] = []
163+
const invalidIds: string[] = []
164+
165+
for (const id of questionIds) {
166+
if (allQuestionIdsSet.has(id)) {
167+
validIds.push(id)
168+
} else {
169+
invalidIds.push(id)
170+
}
171+
}
172+
173+
if (invalidIds.length > 0) {
174+
logger.warn(`Invalid question IDs (will be skipped): ${invalidIds.join(", ")}`)
175+
}
176+
177+
if (validIds.length === 0) {
178+
throw new Error(
179+
`All provided questionIds are invalid. No matching questions found in benchmark "${benchmark}". ` +
180+
`Invalid IDs: ${invalidIds.join(", ")}`
181+
)
182+
}
183+
184+
targetQuestionIds = validIds
185+
logger.info(
186+
`Using explicit questionIds: ${validIds.length} valid questions` +
187+
(invalidIds.length > 0 ? ` (${invalidIds.length} invalid skipped)` : "")
188+
)
189+
} else if (sampling) {
159190
targetQuestionIds = selectQuestionsBySampling(allQuestions, sampling)
160191
} else {
161192
targetQuestionIds = allQuestions.map((q) => q.questionId)

src/orchestrator/index.ts

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -213,8 +213,35 @@ export class Orchestrator {
213213
effectiveLimit = limit
214214

215215
if (questionIds && questionIds.length > 0) {
216-
logger.info(`Using explicit questionIds: ${questionIds.length} questions`)
217-
targetQuestionIds = questionIds
216+
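// Partition the provided IDs into those present in the benchmark and those that are not; unknown IDs are skipped with a warning, and an error is thrown only if none are valid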
217+
const allQuestionIdsSet = new Set(allQuestions.map((q) => q.questionId))
218+
const validIds: string[] = []
219+
const invalidIds: string[] = []
220+
221+
for (const id of questionIds) {
222+
if (allQuestionIdsSet.has(id)) {
223+
validIds.push(id)
224+
} else {
225+
invalidIds.push(id)
226+
}
227+
}
228+
229+
if (invalidIds.length > 0) {
230+
logger.warn(`Invalid question IDs (will be skipped): ${invalidIds.join(", ")}`)
231+
}
232+
233+
if (validIds.length === 0) {
234+
throw new Error(
235+
`All provided questionIds are invalid. No matching questions found in benchmark "${benchmarkName}". ` +
236+
`Invalid IDs: ${invalidIds.join(", ")}`
237+
)
238+
}
239+
240+
targetQuestionIds = validIds
241+
logger.info(
242+
`Using explicit questionIds: ${validIds.length} valid questions` +
243+
(invalidIds.length > 0 ? ` (${invalidIds.length} invalid skipped)` : "")
244+
)
218245
} else if (sampling) {
219246
logger.info(`Using sampling mode: ${sampling.mode}`)
220247
targetQuestionIds = selectQuestionsBySampling(allQuestions, sampling)

src/server/routes/benchmarks.ts

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,75 @@ export async function handleBenchmarksRoutes(req: Request, url: URL): Promise<Re
128128
}
129129
}
130130

131+
// POST /api/benchmarks/:name/expand-ids - Expand conversation/session patterns to question IDs
132+
const expandIdsMatch = pathname.match(/^\/api\/benchmarks\/([^/]+)\/expand-ids$/)
133+
if (method === "POST" && expandIdsMatch) {
134+
const benchmarkName = expandIdsMatch[1]
135+
136+
try {
137+
const body = await req.json()
138+
const { patterns } = body as { patterns: string[] }
139+
140+
if (!patterns || !Array.isArray(patterns)) {
141+
return json({ error: "patterns array is required" }, 400)
142+
}
143+
144+
const benchmark = createBenchmark(benchmarkName as any)
145+
await benchmark.load()
146+
const allQuestions = benchmark.getQuestions()
147+
148+
const expandedIds = new Set<string>()
149+
const patternResults: Record<string, string[]> = {}
150+
151+
for (const pattern of patterns) {
152+
const trimmed = pattern.trim()
153+
if (!trimmed) continue
154+
155+
const expanded: string[] = []
156+
157+
// Pattern 1: Conversation ID (e.g., "conv-26") - expand to all questions
158+
// Matches only when the entire pattern is letters, a hyphen, then digits (so it cannot carry a -q or -session suffix)
159+
if (/^[a-zA-Z]+-\d+$/.test(trimmed)) {
160+
const matchingQuestions = allQuestions.filter((q) =>
161+
q.questionId.startsWith(trimmed + "-q")
162+
)
163+
matchingQuestions.forEach((q) => {
164+
expanded.push(q.questionId)
165+
expandedIds.add(q.questionId)
166+
})
167+
}
168+
// Pattern 2: Session ID (e.g., "conv-26-session_1" or "001be529-session-0")
169+
// Find all questions that reference this session
170+
else if (trimmed.includes("-session")) {
171+
const matchingQuestions = allQuestions.filter((q) =>
172+
q.haystackSessionIds.includes(trimmed)
173+
)
174+
matchingQuestions.forEach((q) => {
175+
expanded.push(q.questionId)
176+
expandedIds.add(q.questionId)
177+
})
178+
}
179+
// Pattern 3: Direct question ID - add as-is if it exists
180+
else {
181+
const exactMatch = allQuestions.find((q) => q.questionId === trimmed)
182+
if (exactMatch) {
183+
expanded.push(trimmed)
184+
expandedIds.add(trimmed)
185+
}
186+
}
187+
188+
patternResults[pattern] = expanded
189+
}
190+
191+
return json({
192+
expandedIds: Array.from(expandedIds),
193+
patternResults,
194+
})
195+
} catch (e) {
196+
return json({ error: e instanceof Error ? e.message : "Failed to expand IDs" }, 400)
197+
}
198+
}
199+
131200
// GET /api/models - List available models
132201
if (method === "GET" && pathname === "/api/models") {
133202
const openai = listModelsByProvider("openai").map((alias) => ({

src/server/routes/compare.ts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,8 @@ export async function handleCompareRoutes(req: Request, url: URL): Promise<Respo
146146
if (method === "POST" && pathname === "/api/compare/start") {
147147
try {
148148
const body = await req.json()
149-
const { providers, benchmark, judgeModel, answeringModel, sampling, force } = body
149+
const { providers, benchmark, judgeModel, answeringModel, sampling, questionIds, force } =
150+
body
150151

151152
if (!providers || !Array.isArray(providers) || providers.length === 0) {
152153
return json({ error: "Missing or invalid providers array" }, 400)
@@ -165,6 +166,7 @@ export async function handleCompareRoutes(req: Request, url: URL): Promise<Respo
165166
judgeModel,
166167
answeringModel,
167168
sampling,
169+
questionIds,
168170
force,
169171
})
170172

@@ -387,6 +389,7 @@ async function initializeComparison(options: {
387389
judgeModel: string
388390
answeringModel: string
389391
sampling?: SamplingConfig
392+
questionIds?: string[]
390393
force?: boolean
391394
}): Promise<{ compareId: string }> {
392395
// Only await manifest creation - this is fast

src/server/routes/runs.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,12 +190,14 @@ export async function handleRunsRoutes(req: Request, url: URL): Promise<Response
190190
answeringModel,
191191
limit,
192192
sampling,
193+
questionIds,
193194
concurrency,
194195
force,
195196
fromPhase,
196197
sourceRunId,
197198
} = body
198199
console.log("[API] Extracted sampling:", sampling)
200+
console.log("[API] Extracted questionIds:", questionIds)
199201
console.log("[API] Extracted concurrency:", concurrency)
200202

201203
if (!provider || !benchmark || !runId || !judgeModel) {
@@ -279,6 +281,7 @@ export async function handleRunsRoutes(req: Request, url: URL): Promise<Response
279281
answeringModel,
280282
limit,
281283
sampling,
284+
questionIds,
282285
concurrency,
283286
force: sourceRunId ? false : force,
284287
fromPhase: fromPhase as PhaseId | undefined,
@@ -374,6 +377,7 @@ async function runBenchmark(options: {
374377
answeringModel?: string
375378
limit?: number
376379
sampling?: SamplingConfig
380+
questionIds?: string[]
377381
concurrency?: ConcurrencyConfig
378382
force?: boolean
379383
fromPhase?: PhaseId
@@ -396,6 +400,7 @@ async function runBenchmark(options: {
396400
answeringModel: options.answeringModel,
397401
limit: options.limit,
398402
sampling: options.sampling,
403+
questionIds: options.questionIds,
399404
concurrency: options.concurrency,
400405
force: options.force,
401406
phases,

0 commit comments

Comments
 (0)