|
| 1 | +// PromptFoo configuration for Mission Brief tests |
| 2 | +// Tests the /adlc.spec.specify command's Mission Brief enforcement |
| 3 | +module.exports = { |
| 4 | + description: 'Mission Brief Enforcement Evaluation', |
| 5 | + |
| 6 | + // Rate limiting to avoid 429 errors |
| 7 | + evaluateOptions: { |
| 8 | + maxConcurrency: 1, |
| 9 | + delay: process.env.CI ? 15000 : 5000, |
| 10 | + }, |
| 11 | + |
| 12 | + // Mission Brief prompt |
| 13 | + prompts: ['file://../prompts/mission-brief-prompt.txt'], |
| 14 | + |
| 15 | + // Configure LLM provider using OpenAI-compatible endpoint |
| 16 | + providers: [ |
| 17 | + { |
| 18 | + id: `openai:chat:${process.env.LLM_MODEL || 'claude-sonnet-4-5-20250929'}`, |
| 19 | + label: `${process.env.LLM_MODEL || 'Sonnet 4.5'} (via AI API Gateway)`, |
| 20 | + config: { |
| 21 | + apiBaseUrl: process.env.LLM_BASE_URL, |
| 22 | + apiKey: process.env.LLM_AUTH_TOKEN, |
| 23 | + temperature: 0.7, |
| 24 | + max_tokens: 4000, |
| 25 | + }, |
| 26 | + env: { |
| 27 | + OPENAI_API_KEY: process.env.LLM_AUTH_TOKEN, |
| 28 | + OPENAI_BASE_URL: process.env.LLM_BASE_URL, |
| 29 | + }, |
| 30 | + }, |
| 31 | + ], |
| 32 | + |
| 33 | + defaultTest: { |
| 34 | + options: { |
| 35 | + provider: `openai:chat:${process.env.LLM_MODEL || 'claude-sonnet-4-5-20250929'}`, |
| 36 | + }, |
| 37 | + assert: [ |
| 38 | + { type: 'python', value: 'file://../graders/custom_graders.py:check_pii_leakage' }, |
| 39 | + { type: 'python', value: 'file://../graders/custom_graders.py:check_prompt_injection' }, |
| 40 | + { type: 'python', value: 'file://../graders/custom_graders.py:check_hallucination_signals' }, |
| 41 | + { type: 'python', value: 'file://../graders/custom_graders.py:check_misinformation' }, |
| 42 | + ], |
| 43 | + }, |
| 44 | + |
| 45 | + tests: [ |
| 46 | + // Test 1: Mission Brief Completeness - Substantial Input |
| 47 | + { |
| 48 | + description: 'Mission Brief: Extracts complete Mission Brief from detailed input', |
| 49 | + vars: { |
| 50 | + user_input: |
| 51 | + 'Build a user authentication system with email/password login, password reset via email, and session management. Users should be able to stay logged in for 30 days. The system must support 10,000 concurrent users and comply with GDPR for European users.', |
| 52 | + }, |
| 53 | + assert: [ |
| 54 | + { type: 'python', value: 'file://../graders/custom_graders.py:check_mission_brief_completeness' }, |
| 55 | + { type: 'icontains', value: 'goal' }, |
| 56 | + { type: 'icontains', value: 'success criteria' }, |
| 57 | + { type: 'icontains', value: 'demo sentence' }, |
| 58 | + { type: 'icontains', value: 'proceed with this mission brief' }, |
| 59 | + ], |
| 60 | + }, |
| 61 | + |
| 62 | + // Test 2: Mission Brief Quality - Goal Extraction |
| 63 | + { |
| 64 | + description: 'Mission Brief: Goal is concise and captures core purpose', |
| 65 | + vars: { |
| 66 | + user_input: |
| 67 | + 'Create a dashboard where admins can view real-time analytics including user signups, active sessions, revenue metrics, and system health. The dashboard should update every 5 seconds and support drill-down into individual metrics.', |
| 68 | + }, |
| 69 | + assert: [ |
| 70 | + { type: 'python', value: 'file://../graders/custom_graders.py:check_mission_brief_quality' }, |
| 71 | + { |
| 72 | + type: 'llm-rubric', |
| 73 | + value: |
| 74 | + 'Grade the Mission Brief Goal quality (0-1):\n' + |
| 75 | + '1. Is the Goal a single sentence?\n' + |
| 76 | + '2. Does it capture the core purpose (admin analytics dashboard)?\n' + |
| 77 | + '3. Is it technology-agnostic (no frameworks, databases)?\n' + |
| 78 | + '4. Is it measurable/achievable?\n' + |
| 79 | + 'Return average score 0-1.', |
| 80 | + threshold: 0.7, |
| 81 | + }, |
| 82 | + ], |
| 83 | + }, |
| 84 | + |
| 85 | + // Test 3: Mission Brief - Constraint Extraction |
| 86 | + { |
| 87 | + description: 'Mission Brief: Extracts constraints from requirements', |
| 88 | + vars: { |
| 89 | + user_input: |
| 90 | + 'Build a payment processing integration for our e-commerce platform. Must comply with PCI-DSS, support credit cards and PayPal, process transactions under 3 seconds, and work with our existing Django backend. Budget is limited so we need to use Stripe as the payment provider.', |
| 91 | + }, |
| 92 | + assert: [ |
| 93 | + { type: 'icontains', value: 'constraint' }, |
| 94 | + { type: 'icontains', value: 'pci' }, |
| 95 | + { |
| 96 | + type: 'llm-rubric', |
| 97 | + value: |
| 98 | + 'Check if the Mission Brief Constraints section includes:\n' + |
| 99 | + '1. PCI-DSS compliance requirement\n' + |
| 100 | + '2. Performance constraint (3 second processing)\n' + |
| 101 | + '3. Technical constraint (Django backend integration)\n' + |
| 102 | + '4. Budget/provider constraint (Stripe)\n' + |
| 103 | + 'Return 1.0 if all constraints captured, 0.5 if some, 0.0 if none.', |
| 104 | + threshold: 0.7, |
| 105 | + }, |
| 106 | + ], |
| 107 | + }, |
| 108 | + |
| 109 | + // Test 4: Mission Brief - Demo Sentence Observable |
| 110 | + { |
| 111 | + description: 'Mission Brief: Demo Sentence is observable and concrete', |
| 112 | + vars: { |
| 113 | + user_input: |
| 114 | + 'Add a file upload feature to the project management app. Users should be able to upload PDF, Word, and image files up to 25MB. Files should be attached to tasks and downloadable by team members.', |
| 115 | + }, |
| 116 | + assert: [ |
| 117 | + { type: 'icontains', value: 'demo sentence' }, |
| 118 | + { type: 'icontains', value: 'user can' }, |
| 119 | + { |
| 120 | + type: 'llm-rubric', |
| 121 | + value: |
| 122 | + 'Grade the Demo Sentence quality (0-1):\n' + |
| 123 | + '1. Does it describe an observable user action (not just "have file upload")?\n' + |
| 124 | + '2. Is it concrete and specific (mentions uploading/downloading files)?\n' + |
| 125 | + '3. Can a human verify this outcome (e.g., "upload a PDF and download it")?\n' + |
| 126 | + '4. Does it avoid implementation details?\n' + |
| 127 | + 'Return average score 0-1.', |
| 128 | + threshold: 0.7, |
| 129 | + }, |
| 130 | + ], |
| 131 | + }, |
| 132 | + |
| 133 | + // Test 5: Mission Brief - Minimal Input Triggers Questions |
| 134 | + { |
| 135 | + description: 'Mission Brief: Minimal input triggers clarifying questions', |
| 136 | + vars: { |
| 137 | + user_input: 'Add search', |
| 138 | + }, |
| 139 | + assert: [ |
| 140 | + { |
| 141 | + type: 'llm-rubric', |
| 142 | + value: |
| 143 | + 'Check if the response asks clarifying questions for the minimal input:\n' + |
| 144 | + '1. Does it ask about the goal/purpose of the search feature?\n' + |
| 145 | + '2. Does it ask about success criteria or expected outcomes?\n' + |
| 146 | + '3. Does it ask about constraints or requirements?\n' + |
| 147 | + '4. Does it NOT skip to creating a spec without gathering more info?\n' + |
| 148 | + 'Return 1.0 if proper questions asked, 0.0 if it proceeds without clarification.', |
| 149 | + threshold: 0.7, |
| 150 | + }, |
| 151 | + ], |
| 152 | + }, |
| 153 | + |
| 154 | + // Test 6: Mission Brief - Approval Prompt Present |
| 155 | + { |
| 156 | + description: 'Mission Brief: Always includes approval prompt', |
| 157 | + vars: { |
| 158 | + user_input: |
| 159 | + 'Create a notification system that sends email, SMS, and push notifications. Users can configure their preferences per notification type. Support for templated messages with variable substitution.', |
| 160 | + }, |
| 161 | + assert: [ |
| 162 | + { type: 'icontains', value: 'proceed' }, |
| 163 | + { type: 'icontains-any', value: ['yes / no', 'yes/no', '(yes', 'yes, no'] }, |
| 164 | + { |
| 165 | + type: 'llm-rubric', |
| 166 | + value: |
| 167 | + 'Check if the response includes a clear approval request:\n' + |
| 168 | + '1. Is there a "Proceed with this Mission Brief?" question?\n' + |
| 169 | + '2. Are options provided (yes/no/adjust or similar)?\n' + |
| 170 | + '3. Does the flow indicate waiting for user response before continuing?\n' + |
| 171 | + 'Return 1.0 if proper approval flow, 0.0 if missing.', |
| 172 | + threshold: 0.8, |
| 173 | + }, |
| 174 | + ], |
| 175 | + }, |
| 176 | + ], |
| 177 | +}; |
0 commit comments