Skip to content

Commit cf0ed98

Browse files
committed
fix: require --instructions and validate placeholders for add evaluator CLI mode
1 parent: 4818a0f · commit: cf0ed98

1 file changed

Lines changed: 66 additions & 30 deletions

File tree

src/cli/primitives/EvaluatorPrimitive.ts

Lines changed: 66 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,12 @@ import type { EvaluationLevel, Evaluator, EvaluatorConfig } from '../../schema';
33
import { EvaluationLevelSchema, EvaluatorSchema } from '../../schema';
44
import { getErrorMessage } from '../errors';
55
import type { RemovalPreview, RemovalResult, SchemaChange } from '../operations/remove/types';
6+
import {
7+
LEVEL_PLACEHOLDERS,
8+
RATING_SCALE_PRESETS,
9+
parseCustomRatingScale,
10+
validateInstructionPlaceholders,
11+
} from '../tui/screens/evaluator/types';
612
import { BasePrimitive } from './BasePrimitive';
713
import type { AddResult, AddScreenComponent, RemovableResource } from './types';
814
import type { Command } from '@commander-js/extra-typings';
@@ -113,21 +119,31 @@ export class EvaluatorPrimitive extends BasePrimitive<AddEvaluatorOptions, Remov
113119
}
114120

115121
registerCommands(addCmd: Command, removeCmd: Command): void {
122+
const presetIds = RATING_SCALE_PRESETS.map(p => p.id);
123+
116124
addCmd
117125
.command(this.kind)
118126
.description('Add a custom evaluator to the project')
119127
.option('--name <name>', 'Evaluator name')
120128
.option('--level <level>', 'Evaluation level: SESSION, TRACE, TOOL_CALL')
121129
.option('--model <model>', 'Bedrock model ID for LLM-as-a-Judge')
122-
.option('--instructions <text>', 'Evaluation prompt instructions')
123-
.option('--config <path>', 'Path to evaluator config JSON file (overrides --model, --instructions)')
130+
.option(
131+
'--instructions <text>',
132+
'Evaluation prompt instructions (must include level-appropriate placeholders, e.g. {context})'
133+
)
134+
.option('--rating-scale <preset>', `Rating scale preset: ${presetIds.join(', ')} (default: 1-5-quality)`)
135+
.option(
136+
'--config <path>',
137+
'Path to evaluator config JSON file (overrides --model, --instructions, --rating-scale)'
138+
)
124139
.option('--json', 'Output as JSON')
125140
.action(
126141
async (cliOptions: {
127142
name?: string;
128143
level?: string;
129144
model?: string;
130145
instructions?: string;
146+
ratingScale?: string;
131147
config?: string;
132148
json?: boolean;
133149
}) => {
@@ -138,62 +154,82 @@ export class EvaluatorPrimitive extends BasePrimitive<AddEvaluatorOptions, Remov
138154
}
139155

140156
if (cliOptions.name || cliOptions.json) {
141-
if (!cliOptions.name || !cliOptions.level) {
142-
const error = '--name and --level are required in non-interactive mode';
157+
const fail = (error: string) => {
143158
if (cliOptions.json) {
144159
console.log(JSON.stringify({ success: false, error }));
145160
} else {
146161
console.error(error);
147162
}
148163
process.exit(1);
164+
};
165+
166+
if (!cliOptions.name || !cliOptions.level) {
167+
fail('--name and --level are required in non-interactive mode');
149168
}
150169

151170
if (!cliOptions.config && !cliOptions.model) {
152-
const error = 'Either --config or --model is required';
153-
if (cliOptions.json) {
154-
console.log(JSON.stringify({ success: false, error }));
155-
} else {
156-
console.error(error);
157-
}
158-
process.exit(1);
171+
fail('Either --config or --model is required');
159172
}
160173

161174
const levelResult = EvaluationLevelSchema.safeParse(cliOptions.level);
162175
if (!levelResult.success) {
163-
const error = `Invalid --level "${cliOptions.level}". Must be one of: SESSION, TRACE, TOOL_CALL`;
164-
if (cliOptions.json) {
165-
console.log(JSON.stringify({ success: false, error }));
166-
} else {
167-
console.error(error);
168-
}
169-
process.exit(1);
176+
fail(`Invalid --level "${cliOptions.level}". Must be one of: SESSION, TRACE, TOOL_CALL`);
170177
}
171178

172179
let configJson: EvaluatorConfig;
173180
if (cliOptions.config) {
174181
const { readFileSync } = await import('fs');
175182
configJson = JSON.parse(readFileSync(cliOptions.config, 'utf-8')) as EvaluatorConfig;
176183
} else {
184+
// --instructions is required when not using --config
185+
if (!cliOptions.instructions) {
186+
const level = levelResult.data!;
187+
const placeholders = LEVEL_PLACEHOLDERS[level].map(p => `{${p}}`).join(', ');
188+
fail(
189+
`--instructions is required in non-interactive mode (or use --config). ` +
190+
`Must include at least one placeholder for ${level}: ${placeholders}`
191+
);
192+
}
193+
194+
// Validate placeholders
195+
const placeholderCheck = validateInstructionPlaceholders(cliOptions.instructions!, levelResult.data!);
196+
if (placeholderCheck !== true) {
197+
fail(placeholderCheck);
198+
}
199+
200+
// Resolve rating scale
201+
let ratingScale: EvaluatorConfig['llmAsAJudge']['ratingScale'];
202+
const scaleInput = cliOptions.ratingScale ?? '1-5-quality';
203+
204+
const preset = RATING_SCALE_PRESETS.find(p => p.id === scaleInput);
205+
if (preset) {
206+
ratingScale = preset.ratingScale;
207+
} else {
208+
// Try parsing as custom format: "1:Poor:Fails, 2:Fair:Partially meets" or "Pass:Meets, Fail:Does not"
209+
const isNumerical = /^\d/.test(scaleInput.trim());
210+
const parsed = parseCustomRatingScale(scaleInput, isNumerical ? 'numerical' : 'categorical');
211+
if (!parsed.success) {
212+
fail(
213+
`Invalid --rating-scale "${scaleInput}". Use a preset (${presetIds.join(', ')}) ` +
214+
`or custom format: "1:Label:Definition, 2:Label:Definition" (numerical) ` +
215+
`or "Label:Definition, Label:Definition" (categorical)`
216+
);
217+
}
218+
ratingScale = parsed.success ? parsed.ratingScale : undefined!;
219+
}
220+
177221
configJson = {
178222
llmAsAJudge: {
179223
model: cliOptions.model!,
180-
instructions: cliOptions.instructions ?? `Evaluate the quality. Context: {context}`,
181-
ratingScale: {
182-
numerical: [
183-
{ value: 1, label: 'Poor', definition: 'Fails to meet expectations' },
184-
{ value: 2, label: 'Fair', definition: 'Partially meets expectations' },
185-
{ value: 3, label: 'Good', definition: 'Meets expectations' },
186-
{ value: 4, label: 'Very Good', definition: 'Exceeds expectations' },
187-
{ value: 5, label: 'Excellent', definition: 'Far exceeds expectations' },
188-
],
189-
},
224+
instructions: cliOptions.instructions!,
225+
ratingScale,
190226
},
191227
};
192228
}
193229

194230
const result = await this.add({
195-
name: cliOptions.name,
196-
level: levelResult.data,
231+
name: cliOptions.name!,
232+
level: levelResult.data!,
197233
config: configJson,
198234
});
199235

0 commit comments

Comments
 (0)