@@ -169,9 +169,10 @@ def test_security_benchmark_definition_basics(prompt_set, fake_secrets):
169169@pytest .mark .parametrize ("prompt_set" , SECURITY_JAILBREAK_PROMPT_SETS .keys ())
170170def test_security_benchmark_definition_standards (prompt_set , fake_secrets ):
171171 def assert_hazard_standards (b ):
172- for h in b .hazards ():
173- # This is uncalibrated.
174- assert h .reference_standard () is None
172+ hazards = b .hazards ()
173+ assert hazards [0 ].reference_standard () is not None
174+ # Both hazards use the same reference!
175+ assert hazards [0 ].reference_standard () == hazards [1 ].reference_standard ()
175176
176177 # benchmark uses ensemble standards for practice and official sets.
177178 official_ensemble_security_benchmark = SecurityBenchmark (EN_US , prompt_set , evaluator = "ensemble" )
@@ -180,8 +181,7 @@ def assert_hazard_standards(b):
180181 Standards ._benchmark_standards_path (official_ensemble_security_benchmark .uid ).name
181182 == f"security_benchmark-0.5-{ EN_US } -official-ensemble.json"
182183 )
183- # TODO: Uncomment this when we calibrate
184- # assert official_default_security_benchmark.standards._data == official_ensemble_security_benchmark.standards._data
184+ assert official_default_security_benchmark .standards ._data == official_ensemble_security_benchmark .standards ._data
185185 assert_hazard_standards (official_ensemble_security_benchmark )
186186 assert_hazard_standards (official_default_security_benchmark )
187187
@@ -193,7 +193,7 @@ def check_score(benchmark, expected):
193193 for h in benchmark .hazards ():
194194 assert h .reference_standard () == expected
195195
196- expected = None # Update after calibration.
196+ expected = 0.8458333333333333
197197 check_score (SecurityBenchmark (EN_US , "official" , evaluator = "ensemble" ), expected )
198198 check_score (SecurityBenchmark (EN_US , "official" , evaluator = "default" ), expected )
199199 # TODO: Add more spot checks here when we add more benchmarks.
0 commit comments