|
| 1 | +"""Eval corpora for release gating (WDD-001).""" |
| 2 | + |
| 3 | +from __future__ import annotations |
| 4 | + |
| 5 | +from teaagent.eval_suite import EvalCategory, EvalStore, EvalSuite, EvalTest |
| 6 | +from teaagent.prompt_regression import PromptRegressionEvaluator, PromptRegressionTest |
| 7 | + |
# Identifier under which the release eval suite is looked up and stored.
RELEASE_EVAL_SUITE_ID = 'release-eval-corpus'
# Display name given to the release eval suite when it is created.
RELEASE_EVAL_SUITE_NAME = 'Release Eval Corpus'
| 10 | + |
| 11 | + |
def create_conversational_quality_tests() -> list[PromptRegressionTest]:
    """Return the conversational regression cases, one per axis.

    Axes, in order: clarification, interruption, correction and
    long-context recall.
    """
    case_specs = (
        dict(
            test_id='conv-clarify-001',
            name='Clarification before action',
            prompt='Fix the auth bug.',
            expected_output=(
                'Before I change code, could you clarify which auth flow failed '
                'and whether this is login, token refresh, or permission checks?'
            ),
            expected_behavior={'keywords': ['clarif', 'auth']},
            tolerance_threshold=0.65,
            metadata={'axis': 'clarification'},
        ),
        dict(
            test_id='conv-interrupt-001',
            name='Graceful interruption handling',
            prompt='Stop — switch to writing tests only, no more refactors.',
            expected_output=(
                'Understood. I will stop refactoring and focus only on adding tests '
                'from this point forward.'
            ),
            expected_behavior={'keywords': ['stop', 'tests']},
            tolerance_threshold=0.65,
            metadata={'axis': 'interruption'},
        ),
        dict(
            test_id='conv-correct-001',
            name='User correction acknowledgment',
            prompt='No, the failing module is billing, not auth.',
            expected_output=(
                'Thanks for the correction — I will target the billing module instead '
                'of auth and re-check the failing tests there.'
            ),
            expected_behavior={'keywords': ['billing', 'correction']},
            tolerance_threshold=0.65,
            metadata={'axis': 'correction'},
        ),
        dict(
            test_id='conv-recall-001',
            name='Long-context recall',
            prompt='What was the budget cap and rollback rule we agreed on earlier?',
            expected_output=(
                'Earlier you set a budget cap of 2000 cents with rollback required '
                'before any destructive shell command.'
            ),
            expected_behavior={
                'keywords': ['budget', 'rollback'],
                'min_length': 40,
            },
            tolerance_threshold=0.6,
            metadata={'axis': 'long_context_recall'},
        ),
    )
    # Each spec's keys are exactly the keyword arguments of one test case.
    return [PromptRegressionTest(**spec) for spec in case_specs]
| 67 | + |
| 68 | + |
def _to_eval_test(
    regression_test: PromptRegressionTest,
    *,
    category: EvalCategory,
) -> EvalTest:
    """Wrap a prompt-regression case as an ``EvalTest`` filed under ``category``.

    The prompt, expected output, expected behavior and tolerance threshold
    are copied into the eval test's metadata alongside the regression test's
    own metadata entries.
    """
    label = regression_test.name
    core_fields = {
        'prompt': regression_test.prompt,
        'expected_output': regression_test.expected_output,
        'expected_behavior': regression_test.expected_behavior,
        'tolerance_threshold': regression_test.tolerance_threshold,
    }
    # Per-test metadata is unpacked last, so it wins on any key collision.
    merged_metadata = {**core_fields, **regression_test.metadata}
    return EvalTest(
        test_id=regression_test.test_id,
        name=label,
        category=category,
        description=f'Eval corpus test: {label}',
        metadata=merged_metadata,
    )
| 87 | + |
| 88 | + |
def register_release_eval_suite(store: EvalStore) -> str:
    """Register prompt + conversational tests in the release eval suite.

    If ``store`` already holds a suite under ``RELEASE_EVAL_SUITE_ID`` it is
    left untouched. Otherwise a new suite is built from the evaluator's
    default regression tests followed by the conversational quality tests,
    and saved to ``store``.

    Args:
        store: Store queried with ``load_suite`` and written with
            ``save_suite``.

    Returns:
        The id of the release eval suite, whether it was just created or
        was already present.
    """
    # Check first: when the suite exists nothing else needs to be built.
    if store.load_suite(RELEASE_EVAL_SUITE_ID) is not None:
        return RELEASE_EVAL_SUITE_ID

    suite = EvalSuite(
        suite_id=RELEASE_EVAL_SUITE_ID,
        name=RELEASE_EVAL_SUITE_NAME,
        description='Prompt regression + conversational quality corpus (WDD-001).',
    )
    # The evaluator is only needed on the build path, so create it here
    # rather than before the early return.
    evaluator = PromptRegressionEvaluator()
    for regression_test in (
        *evaluator.create_default_regression_tests(),
        *create_conversational_quality_tests(),
    ):
        # Conversational cases are recognised by their 'conv-' id prefix;
        # everything else is filed as prompt regression.
        category = (
            EvalCategory.CONVERSATIONAL
            if regression_test.test_id.startswith('conv-')
            else EvalCategory.PROMPT_REGRESSION
        )
        suite.add_test(_to_eval_test(regression_test, category=category))
    store.save_suite(suite)
    return suite.suite_id
0 commit comments