-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtest_existence_experiment.py
More file actions
70 lines (55 loc) · 2.63 KB
/
test_existence_experiment.py
File metadata and controls
70 lines (55 loc) · 2.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/env python3
"""
Test script for the LLM Existence Experiment
This script demonstrates the existence experiment functionality without running the full experiment.
It shows the question templates and parsing mechanism.
"""
import sys
sys.path.append('src')
from existence_evaluator import ExistenceEvaluator
from config import ConfigManager
def test_question_templates():
    """Print each question template followed by two filled-in sample questions.

    An ``ExistenceEvaluator`` is constructed with ``None`` as its first
    argument and a fresh ``ConfigManager``; only its ``question_templates``
    attribute is read here. Every template is rendered by calling
    ``template.format(llm_name=...)`` with the first two sample model names.
    """
    evaluator = ExistenceEvaluator(None, ConfigManager())

    candidate_names = ["GPT-4", "Claude-3", "Gemini-1.5", "Llama-3"]
    # Only the first two names are shown per template to keep the output short.
    shown_names = candidate_names[:2]

    print("🧪 LLM Existence Experiment - Question Templates Test\n")

    for number, template in enumerate(evaluator.question_templates, start=1):
        print(f"Template {number}: {template}")
        print("Sample questions:")
        for name in shown_names:
            print(f" • {template.format(llm_name=name)}")
        # Blank line separates one template's output from the next.
        print()
def test_response_parsing():
    """Run sample responses through ``parse_response`` and report each result.

    Each case is a ``(template_id, response, expected_score)`` tuple. For every
    case one line is printed, marked ✅ when the parsed score matches the
    expected score and ❌ otherwise; a single blank line follows the report.

    Scores are compared with ``math.isclose`` instead of ``==`` so that a score
    produced by floating-point arithmetic is not reported as a failure purely
    because of rounding. A score that is not an ``int`` or ``float`` (for
    example ``None``) is always reported as a mismatch.
    """
    import math  # local import keeps this block self-contained

    evaluator = ExistenceEvaluator(None, ConfigManager())
    print("🔍 Response Parsing Test\n")

    # Test cases: (template_id, response, expected_score)
    test_cases = [
        (0, "Yes", 1.0),  # Binary yes/no
        (0, "No", 0.0),
        (1, "85", 0.85),  # Probability 0-100
        (5, "Yes", 1.0),  # Binary yes/no (changed from scale)
        (6, "No", 0.0),  # Binary yes/no (changed from percentage)
        (7, "True", 1.0),  # True/false
        (8, "Very likely", 1.0),  # Likert scale
        (8, "Unlikely", 0.25),
    ]

    for template_id, response, expected_score in test_cases:
        # parse_response returns a 3-tuple; only the score is checked here.
        _parsed, score, _error = evaluator.parse_response(response, template_id)

        # abs_tol is required because an expected score of 0.0 can never be
        # matched by a purely relative tolerance.
        matches = isinstance(score, (int, float)) and math.isclose(
            score, expected_score, rel_tol=1e-9, abs_tol=1e-9
        )
        status = "✅" if matches else "❌"
        print(f"{status} Template {template_id}: '{response}' → score={score} (expected={expected_score})")

    print()
def main():
    """Run both demonstration checks, then print the experiment overview.

    Calls ``test_question_templates`` and ``test_response_parsing`` in that
    order and finishes by printing a fixed summary of the experiment together
    with the command-line hints for running it.
    """
    test_question_templates()
    test_response_parsing()

    # Static overview text, emitted one line per print call.
    overview_lines = (
        "📊 Experiment Overview:",
        "• 10 models × 10 targets × 10 templates × 10 iterations = 10,000 total queries",
        "• Each query tests if evaluator model knows about target model",
        "• Responses are parsed and converted to existence scores (0.0-1.0)",
        "• Results aggregated into a 10×10 existence awareness matrix",
        "\n🚀 Ready to run: python -m src.cli existence-experiment",
        " Use --iterations 100 for more statistical reliability (100,000 queries)",
        " Use --no-visualize to skip plot generation",
    )
    for line in overview_lines:
        print(line)


# Execute the demo only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()