# File: statistical-mode-example.yaml
# EvalView Test Case: Statistical Mode Example
# Demonstrates variance-aware testing for non-deterministic AI agents
#
# Statistical mode runs the test multiple times and computes:
#   - Pass rate across runs
#   - Score mean, std dev, percentiles
#   - Flakiness assessment
#   - Confidence intervals
#
# NOTE(review): the block nesting in this file was reconstructed from a copy
# whose indentation had been flattened. In particular, `variance` is placed
# under `thresholds` -- confirm against the EvalView test-case schema.

name: "Statistical Research Agent Test"
description: "Test research agent with variance analysis for non-deterministic outputs"

# Difficulty level for benchmarking and filtering
# Options: trivial, easy, medium, hard, expert
# Use: evalview run --difficulty medium
difficulty: medium

# Suite type for categorization
# Options: capability (hill-climbing) or regression (safety net)
suite_type: capability

# Configure adapter
adapter: langgraph
endpoint: http://localhost:2024

# Prompt sent to the agent, plus extra context passed along with it
input:
  query: "Research the latest developments in AI agents and summarize the key trends"
  context:
    max_steps: 5

# What a passing run is expected to look like
expected:
  # Tools the agent is expected to use
  tools:
    - web_search
    - retrieve_documents
    - summarize

  # Expected order of tool calls
  tool_sequence:
    - web_search
    - retrieve_documents
    - summarize

  # Substrings that must / must not appear in the agent's output
  output:
    contains:
      - "AI"
      - "agents"
    not_contains:
      - "error"
      - "failed"

# Pass/fail limits for a run
# NOTE(review): units are not stated in this file (max_cost presumably a
# currency amount, max_latency presumably milliseconds) -- confirm.
thresholds:
  min_score: 75
  max_cost: 0.50
  max_latency: 30000

  # Statistical/Variance mode configuration
  # When present, test runs multiple times and pass/fail is determined statistically
  variance:
    # Number of times to run the test (2-100, default: 10)
    runs: 10

    # Required proportion of runs that must pass (0.0-1.0, default: 0.8)
    # e.g., 0.8 means 8/10 runs must pass for the test to be considered passing
    pass_rate: 0.8

    # Optional: Minimum mean score across all runs
    # Ensures average performance meets a threshold
    min_mean_score: 70

    # Optional: Maximum allowed standard deviation
    # Lower values enforce more consistent outputs
    max_std_dev: 15

    # Confidence level for statistical intervals (0.5-0.99, default: 0.95)
    confidence_level: 0.95