fix: auto-weight grouped rubrics shorthand by criteria count (#1099)

christso · claude · web-flow · commit d5f2b621b903 · 2026-04-15T08:25:35.000+10:00
* fix: auto-weight grouped rubrics shorthand by criteria count

  When string shorthand assertions are mixed with other explicit graders, the rubrics grader created from the strings now gets weight = number of criteria, making each user-visible assertion contribute equal weight to the overall score.

  Before: [contains, "A", "B", "C"] → contains(w=1) + rubrics(w=1) → 50/50
  After:  [contains, "A", "B", "C"] → contains(w=1) + rubrics(w=3) → 25/75

  The shorthand abstraction is now transparent — users who write N string criteria alongside M explicit graders get equal weight per visible line, without needing to know about internal grader grouping.

  Closes #1098

  Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* style: fix biome formatting

* test: remove redundant shorthand weight tests

* style: fix trailing blank line

---------

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
diff --git a/packages/core/src/evaluation/loaders/evaluator-parser.ts b/packages/core/src/evaluation/loaders/evaluator-parser.ts
@@ -288,7 +288,15 @@ async function parseEvaluatorList(
           }
           const placeholderIndex = result.indexOf(PLACEHOLDER);
           if (strings.length > 0 && placeholderIndex !== -1) {
-            result[placeholderIndex] = { type: 'rubrics', criteria: strings };
+            // Set weight = number of criteria so each user-visible string assertion contributes
+            // equal weight to the overall score alongside other explicit graders.
+            // e.g. [contains, "crit1", "crit2", "crit3"] → contains(w=1) + rubrics(w=3)
+            // → each of the 4 visible assertions counts equally.
+            result[placeholderIndex] = {
+              type: 'rubrics',
+              criteria: strings,
+              weight: strings.length,
+            };
           } else if (placeholderIndex !== -1) {
             // All strings were empty — remove the placeholder
             result.splice(placeholderIndex, 1);
diff --git a/packages/core/test/evaluation/loaders/evaluator-parser.test.ts b/packages/core/test/evaluation/loaders/evaluator-parser.test.ts
@@ -1989,6 +1989,32 @@ describe('parseEvaluators - string shorthand in assertions', () => {
 
     expect(evaluators).toBeUndefined();
   });
+
+  it('sets rubrics grader weight = criteria count when mixed with other graders', async () => {
+    // The user wrote 4 assertions (3 strings + 1 explicit grader); each should count equally.
+    // The 3 strings are grouped into one grader: rubrics(w=3) + contains(w=1) → 1/4 per assertion.
+    const evaluators = await parseEvaluators(
+      {
+        assertions: [
+          'Identifies the undefined access',
+          'Suggests a null-safe fix',
+          'Explains why the original code is dangerous',
+          { type: 'contains', value: 'null' },
+        ],
+      },
+      undefined,
+      ['/tmp'],
+      'test-id',
+    );
+
+    expect(evaluators).toHaveLength(2);
+    const rubrics = evaluators?.[0] as LlmGraderEvaluatorConfig;
+    expect(rubrics.type).toBe('llm-grader');
+    expect(rubrics.rubrics).toHaveLength(3);
+    expect(rubrics.weight).toBe(3);
+    expect(evaluators?.[1].type).toBe('contains');
+    expect(evaluators?.[1].weight).toBeUndefined(); // no weight is injected into the explicit grader
+  });
 });
 
 describe('parseEvaluators - file:// prefix prompt resolution', () => {