Skip to content

Commit a09e149

Browse files
Fix IFEval correctness bugs in if_functions and IFEvalVerifier (allenai#1683)
* Bundle small correctness fixes from allenai#1615/allenai#1618/allenai#1619/allenai#1623/allenai#1625/allenai#1646/allenai#1651/allenai#1655 with regression tests Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> * Address PR review: inline write_header, pricing per 1M tokens, parameterize tests, dedupe if_functions Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> * Trim branch to IFEval-only changes; other fixes moved to allenai#1684/allenai#1685/allenai#1686 Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> * Address review: use word-boundary regex in validate_choice to avoid substring false positives Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> * Revert deletion of scripts/eval_constraints/if_functions.py (handled in separate PR) Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> * Revert validate_choice regex change; add TODO comment with suggested word-boundary fix Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> * Update if_functions.py
1 parent a099ff4 commit a09e149

5 files changed

Lines changed: 61 additions & 3 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ All notable changes to this project will be documented in this file.
3838
- Add deprecation warning to `finetune.py` pointing users to the OLMo-core SFT implementation (https://github.com/allenai/open-instruct/pull/1574).
3939

4040
### Fixed
41+
- Bundle IFEval correctness fixes: `validate_choice` operand direction and `validate_frequency_capital_words` "around" tolerance in `open_instruct/if_functions.py`; `IFEvalVerifier` `ZeroDivisionError` when the instruction list is empty; deduplicate by deleting the unused `scripts/eval_constraints/if_functions.py` copy (supersedes #1615, #1646, #1655).
4142
- Fix NameError on `streaming_config/vllm_config` when importing from `grpo_fast.py` due to implicit global dependency. (https://github.com/allenai/open-instruct/pull/1675)
4243
- Fix `grpo.py` token-count inflation by emitting bool `response_masks` from `DataPreparationActor` (instead of int64 doc-id-valued masks) and dropping per-consumer `.bool()` coercions in `grpo_fast.py`, `grpo_utils.py`, and `olmo_core_train_modules.py`. Previously the OLMo-core path summed the doc-id-valued mask in `calculate_token_counts`, inflating `loss_denominator` by ~60× (https://github.com/allenai/open-instruct/pull/1668).
4344
- Fix hardcoded project version (https://github.com/allenai/open-instruct/pull/1636) by using setuptools scm's automatic versioning.

open_instruct/ground_truth_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -351,7 +351,7 @@ def __call__(
351351
rewards.append(1.0)
352352
else:
353353
rewards.append(0.0)
354-
return VerificationResult(score=sum(rewards) / len(rewards))
354+
return VerificationResult(score=sum(rewards) / max(len(rewards), 1))
355355

356356

357357
class IFEvalVerifierOld(VerifierFunction):

open_instruct/if_functions.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -360,7 +360,9 @@ def validate_title(text: str) -> bool:
360360

361361
# Choose: From Answer with one of the following options: {options}
362362
def validate_choice(text: str, options: list) -> bool:
363-
return any(text in option for option in options)
363+
# substring matching yields false positives (e.g. option "A" matches "Apple"); if this
364+
# is a problem, switch to a regex, e.g. re.search(rf"(?<!\w){re.escape(str(option))}(?!\w)", text).
365+
return any(option in text for option in options)
364366

365367

366368
# Minimum Number Highlighted Section: Highlight at least {N} sections in your answer with markdown, i.e. *highlighted
@@ -430,7 +432,7 @@ def validate_frequency_capital_words(text: str, N: int, quantifier: str) -> bool
430432
if quantifier == "at least":
431433
return len(words) >= N
432434
elif quantifier == "around":
433-
return len(words) == N
435+
return abs(len(words) - N) <= max(round(N * 0.1), 1)
434436
elif quantifier == "at most":
435437
return len(words) <= N
436438
else:

open_instruct/test_ground_truth_utils.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
from parameterized import parameterized
1212

13+
from open_instruct import ground_truth_utils
1314
from open_instruct.ground_truth_utils import (
1415
F1Verifier,
1516
GSM8KVerifier,
@@ -229,5 +230,16 @@ def test_cleanup_helpers_are_safe_noops(self):
229230
self.assertIsNone(asyncio.run(cleanup_all_llm_judge_clients()))
230231

231232

233+
class TestIFEvalVerifierEmptyInstructions(unittest.TestCase):
234+
"""Regression test for PR #1655: IFEvalVerifier crashed with
235+
ZeroDivisionError when the constraint's instruction_id list was empty."""
236+
237+
def test_empty_instruction_list_returns_zero_score(self):
238+
verifier = ground_truth_utils.IFEvalVerifier()
239+
label = str([{"instruction_id": [], "kwargs": []}])
240+
result = verifier(tokenized_prediction=[1, 2, 3], prediction="some non-empty response", label=label)
241+
self.assertEqual(result.score, 0.0)
242+
243+
232244
if __name__ == "__main__":
233245
unittest.main()

open_instruct/test_if_functions.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
"""Tests for IFEval constraint verifiers in open_instruct.if_functions.
2+
3+
Covers regression fixes from PRs #1615 / #1646 (validate_choice operand
4+
direction) and #1646 (validate_frequency_capital_words "around" tolerance).
5+
"""
6+
7+
import unittest
8+
9+
from parameterized import parameterized
10+
11+
from open_instruct import if_functions
12+
13+
14+
class TestValidateChoice(unittest.TestCase):
15+
@parameterized.expand(
16+
[
17+
("option_in_text", "I believe the answer is B", ["A", "B", "C", "D"], True),
18+
("option_appears", "I choose red", ["red", "blue"], True),
19+
("no_option_present", "I choose green", ["red", "blue"], False),
20+
("exact_match", "red", ["red", "blue"], True),
21+
]
22+
)
23+
def test_validate_choice(self, _name, text, options, expected):
24+
self.assertEqual(if_functions.validate_choice(text, options), expected)
25+
26+
27+
class TestValidateFrequencyCapitalWords(unittest.TestCase):
28+
@parameterized.expand(
29+
[
30+
("around_exact", "AAA BBB CCC DDD EEE", 5, "around", True),
31+
("around_off_by_one_low", "AAA BBB CCC DDD", 5, "around", True),
32+
("around_off_by_one_high", "AAA BBB CCC DDD EEE FFF", 5, "around", True),
33+
("around_far_off", "AAA BBB", 5, "around", False),
34+
("at_least_satisfied", "AAA BBB CCC", 2, "at least", True),
35+
("at_most_violated", "AAA BBB CCC DDD", 2, "at most", False),
36+
]
37+
)
38+
def test_validate_frequency_capital_words(self, _name, text, n, quantifier, expected):
39+
self.assertEqual(if_functions.validate_frequency_capital_words(text, n, quantifier), expected)
40+
41+
42+
if __name__ == "__main__":
43+
unittest.main()

0 commit comments

Comments
 (0)