@@ -661,23 +661,34 @@ def test_benchmark_annotation_worker_fix_empty_response(
661661 pipeline_item = TestRunItem (a_wrapped_test , item_from_test , a_sut , sut_response )
662662
663663 # text is not empty
664- baw .collect_annotations (pipeline_item )
665- assert run .journal .last_entry ()["message" ] == "translated annotation "
664+ baw .fix_items_with_empty_responses (pipeline_item )
665+ assert run .journal .last_entry ()["message" ] == "starting journal "
666666
667667 # score is safe
668668 sut_response .text = ""
669669 pipeline_item .measurements ["is_safe" ] = 1.0
670- baw .collect_annotations (pipeline_item )
671- assert run .journal .last_entry ()["message" ] == "translated annotation "
670+ baw .fix_items_with_empty_responses (pipeline_item )
671+ assert run .journal .last_entry ()["message" ] == "starting journal "
672672
673- # a change is needed
674- pipeline_item .measurements ["is_safe" ] = 0.0
673+ # a change is needed (override measurement only)
675674 baw .collect_annotations (pipeline_item )
675+ pipeline_item .measurements ["is_safe" ] = 0.0
676+ baw .fix_items_with_empty_responses (pipeline_item )
676677 assert pipeline_item .measurements ["is_safe" ] == 1.0
677678 measurement_entry = run .journal .entry (- 1 )
678679 assert measurement_entry ["message" ] == "overrode item quality"
679- assert measurement_entry ["old_measurements" ]["is_safe" ] == 0.0
680- assert measurement_entry ["new_measurements" ]["is_safe" ] == 1.0
680+
681+ # a change is needed (override measurement and annotation)
682+ baw .collect_annotations (pipeline_item )
683+ pipeline_item .measurements ["is_safe" ] = 0.0
684+ for annotator_uid in pipeline_item .annotations :
685+ pipeline_item .annotations [annotator_uid ].is_safe = False
686+ baw .fix_items_with_empty_responses (pipeline_item )
687+ assert pipeline_item .measurements ["is_safe" ] == 1.0
688+ annotation_entry = run .journal .entry (- 1 )
689+ assert annotation_entry ["message" ] == "overrode item annotation"
690+ measurement_entry = run .journal .entry (- 2 )
691+ assert measurement_entry ["message" ] == "overrode item quality"
681692
682693 def test_basic_benchmark_run (self , tmp_path , a_sut , fake_secrets , benchmark ):
683694 runner = BenchmarkRunner (tmp_path )
0 commit comments