Skip to content

Commit 268d04f

Browse files
Trim branch to IFEval-only changes; other fixes moved to #1684/#1685/#1686 Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent b80930c commit 268d04f

7 files changed

Lines changed: 27 additions & 128 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ All notable changes to this project will be documented in this file.
3838
- Add deprecation warning to `finetune.py` pointing users to the OLMo-core SFT implementation (https://github.com/allenai/open-instruct/pull/1574).
3939

4040
### Fixed
41-
- Bundle small correctness fixes: `validate_choice` operand direction and `validate_frequency_capital_words` "around" tolerance in `open_instruct/if_functions.py` and `scripts/eval_constraints/if_functions.py`; gpt-4o / gpt-4o-standard output price in `judge_utils.py` (10× undercount); CSV header handling in `save_benchmark_results_to_csv` and `save_completion_lengths`; `_get_batch_logps` division-by-zero on fully-masked sequences in `dpo_utils.py`; `IFEvalVerifier` `ZeroDivisionError` when the instruction list is empty (supersedes #1615, #1618, #1619, #1623, #1625, #1646, #1651, #1655).
41+
- Bundle IFEval correctness fixes: `validate_choice` operand direction and `validate_frequency_capital_words` "around" tolerance in `open_instruct/if_functions.py`; `IFEvalVerifier` `ZeroDivisionError` when the instruction list is empty; deduplicate by deleting the unused `scripts/eval_constraints/if_functions.py` copy (supersedes #1615, #1646, #1655).
4242
- Fix NameError on `streaming_config/vllm_config` when importing from `grpo_fast.py` due to implicit global dependency. (https://github.com/allenai/open-instruct/pull/1675)
4343
- Fix `grpo.py` token-count inflation by emitting bool `response_masks` from `DataPreparationActor` (instead of int64 doc-id-valued masks) and dropping per-consumer `.bool()` coercions in `grpo_fast.py`, `grpo_utils.py`, and `olmo_core_train_modules.py`. Previously the OLMo-core path summed the doc-id-valued mask in `calculate_token_counts`, inflating `loss_denominator` by ~60× (https://github.com/allenai/open-instruct/pull/1668).
4444
- Fix hardcoded project version (https://github.com/allenai/open-instruct/pull/1636) by using setuptools scm's automatic versioning.

open_instruct/benchmark_generators.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,7 @@ def save_completion_lengths(batch_results: list[dict], timestamp: int, batch_idx
5757
with open(csv_path, "a", newline="") as csvfile:
5858
fieldnames = ["batch_num", "prompt_num", "completion_length"]
5959
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
60-
if csvfile.tell() == 0:
61-
writer.writeheader()
60+
writer.writeheader()
6261

6362
for batch_result in batch_results:
6463
response_lengths = batch_result["response_lengths"]
@@ -143,7 +142,7 @@ def save_benchmark_results_to_csv(
143142

144143
with csv_path.open("a", newline="") as csvfile:
145144
writer = csv.DictWriter(csvfile, fieldnames=list(row_data.keys()))
146-
if csvfile.tell() == 0:
145+
if not csv_path.exists():
147146
writer.writeheader()
148147
writer.writerow(row_data)
149148
logger.info(f"Saved benchmark results to {csv_path}")

open_instruct/dpo_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -707,7 +707,7 @@ def _get_batch_logps(
707707
loss_mask = labels[:, 1:] != -100
708708

709709
if average_log_prob:
710-
return (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1).clamp(min=1)
710+
return (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)
711711
else:
712712
return (per_token_logps * loss_mask).sum(-1)
713713

open_instruct/ground_truth_utils.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
from open_instruct import context_window_checker, logger_utils
2727
from open_instruct.if_functions import IF_FUNCTIONS_MAP
2828
from open_instruct.IFEvalG import instructions_registry
29-
from open_instruct.judge_utils import EXTRACTOR_MAP, JUDGE_PROMPT_MAP, PRICE_PER_MILLION_TOKENS, build_messages
29+
from open_instruct.judge_utils import EXTRACTOR_MAP, JUDGE_PROMPT_MAP, PRICE_PER_TOKEN, build_messages
3030
from open_instruct.math_utils import (
3131
get_unnormalized_answer,
3232
hendrycks_is_equiv,
@@ -743,9 +743,9 @@ def get_cost(self, response, model: str):
743743
model_name = model.split("/")[-1] # for litellm, discard the namespace
744744
model_name = model_name.replace("-standard", "") # azure OAI models have -standard in the name
745745
return (
746-
PRICE_PER_MILLION_TOKENS.get(model_name, {}).get("input", 0) * response.usage.prompt_tokens
747-
+ PRICE_PER_MILLION_TOKENS.get(model_name, {}).get("output", 0) * response.usage.completion_tokens
748-
) / 1_000_000
746+
PRICE_PER_TOKEN.get(model_name, {}).get("input", 0) * response.usage.prompt_tokens
747+
+ PRICE_PER_TOKEN.get(model_name, {}).get("output", 0) * response.usage.completion_tokens
748+
)
749749

750750
async def async_call(
751751
self,

open_instruct/judge_utils.py

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -5,24 +5,24 @@
55

66
logger = logger_utils.setup_logger(__name__)
77

8-
PRICE_PER_MILLION_TOKENS = {
9-
"gpt-4": {"input": 30.0, "output": 60.0},
10-
"gpt-3.5-turbo": {"input": 1.5, "output": 2.0},
11-
"gpt-4-1106-preview": {"input": 10.0, "output": 30.0},
12-
"gpt-4o": {"input": 2.5, "output": 10.0},
13-
"gpt-4o-mini": {"input": 0.15, "output": 0.6},
14-
"gpt-4o-standard": {"input": 2.5, "output": 10.0},
15-
"gpt-4.1": {"input": 2.0, "output": 8.0},
16-
"gpt-4.1-standard": {"input": 2.0, "output": 8.0},
17-
"gpt-4.1-mini-standard": {"input": 0.4, "output": 1.6},
18-
"o3": {"input": 10.0, "output": 40.0},
19-
"o3-standard": {"input": 10.0, "output": 40.0},
20-
"claude-sonnet": {"input": 3.0, "output": 15.0},
21-
"deepseek-chat": {"input": 0.07, "output": 1.0},
22-
"deepseek-reasoner": {"input": 0.14, "output": 2.0},
23-
"claude-3-7-sonnet-20250219": {"input": 3.0, "output": 15.0},
24-
"minimax-m2.7": {"input": 0.3, "output": 1.2},
25-
"minimax-m2.7-highspeed": {"input": 0.6, "output": 2.4},
8+
PRICE_PER_TOKEN = {
9+
"gpt-4": {"input": 0.00003, "output": 0.00006},
10+
"gpt-3.5-turbo": {"input": 0.0000015, "output": 0.000002},
11+
"gpt-4-1106-preview": {"input": 0.00001, "output": 0.00003},
12+
"gpt-4o": {"input": 0.0000025, "output": 0.000001},
13+
"gpt-4o-mini": {"input": 0.00000015, "output": 0.0000006},
14+
"gpt-4o-standard": {"input": 0.0000025, "output": 0.000001},
15+
"gpt-4.1": {"input": 0.000002, "output": 0.000008},
16+
"gpt-4.1-standard": {"input": 0.000002, "output": 0.000008},
17+
"gpt-4.1-mini-standard": {"input": 0.0000004, "output": 0.0000016},
18+
"o3": {"input": 0.00001, "output": 0.00004},
19+
"o3-standard": {"input": 0.00001, "output": 0.00004},
20+
"claude-sonnet": {"input": 0.000003, "output": 0.000015},
21+
"deepseek-chat": {"input": 0.00000007, "output": 0.000001},
22+
"deepseek-reasoner": {"input": 0.00000014, "output": 0.000002},
23+
"claude-3-7-sonnet-20250219": {"input": 0.000003, "output": 0.000015},
24+
"minimax-m2.7": {"input": 0.0000003, "output": 0.0000012},
25+
"minimax-m2.7-highspeed": {"input": 0.0000006, "output": 0.0000024},
2626
}
2727

2828
# Define the templates for different judge types
Lines changed: 1 addition & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,8 @@
1-
import csv
2-
import pathlib
3-
import tempfile
41
import unittest
5-
from unittest import mock
62

73
import parameterized
84

9-
from open_instruct import benchmark_generators, utils
5+
from open_instruct import utils
106

117

128
class TestBenchmark(unittest.TestCase):
@@ -18,78 +14,5 @@ def test_get_device_name(self, device_name, expected):
1814
self.assertEqual(result, expected)
1915

2016

21-
class TestSaveCompletionLengths(unittest.TestCase):
22-
"""Regression test for PR #1623: duplicate headers on every batch."""
23-
24-
def test_header_written_once_across_batches(self):
25-
with tempfile.TemporaryDirectory() as tmp:
26-
tmp_path = pathlib.Path(tmp)
27-
timestamp = 1234567890
28-
batches = [
29-
[{"response_lengths": [10, 20]}],
30-
[{"response_lengths": [30]}],
31-
[{"response_lengths": [40, 50, 60]}],
32-
]
33-
with mock.patch.object(benchmark_generators, "DATA_DIR", tmp_path):
34-
for batch_idx, batch_results in enumerate(batches):
35-
benchmark_generators.save_completion_lengths(batch_results, timestamp, batch_idx)
36-
37-
csv_path = tmp_path / f"completion_lengths_{timestamp}.csv"
38-
with csv_path.open() as f:
39-
rows = list(csv.reader(f))
40-
41-
self.assertEqual(rows[0], ["batch_num", "prompt_num", "completion_length"])
42-
for row in rows[1:]:
43-
self.assertNotEqual(row, ["batch_num", "prompt_num", "completion_length"])
44-
self.assertEqual(len(rows), 1 + 2 + 1 + 3)
45-
46-
47-
class TestSaveBenchmarkResultsToCsv(unittest.TestCase):
48-
"""Regression test for PR #1619: header never written because the
49-
Path('a') open call created the file before the existence check."""
50-
51-
def _streaming_config(self):
52-
return mock.MagicMock(num_unique_prompts_rollout=4, num_samples_per_prompt_rollout=2, response_length=128)
53-
54-
def _model_config(self):
55-
return mock.MagicMock(model_name_or_path="fake/model")
56-
57-
def _agg(self):
58-
return {
59-
"total_generation_time": 1.0,
60-
"total_weight_sync_time": 0.5,
61-
"total_num_new_tokens": 100,
62-
"avg_tokens_per_second": 50.0,
63-
"avg_mfu": 0.1,
64-
"avg_mbu": 0.2,
65-
"avg_generation_time": 0.5,
66-
"avg_weight_sync_time": 0.25,
67-
}
68-
69-
def test_header_written_exactly_once_across_calls(self):
70-
with tempfile.TemporaryDirectory() as tmp:
71-
tmp_path = pathlib.Path(tmp)
72-
with (
73-
mock.patch.object(benchmark_generators, "DATA_DIR", tmp_path),
74-
mock.patch.object(benchmark_generators, "aggregate_results", return_value=self._agg()),
75-
mock.patch.object(benchmark_generators.utils, "get_git_commit", return_value="abc"),
76-
):
77-
for _ in range(3):
78-
benchmark_generators.save_benchmark_results_to_csv(
79-
results=[{}],
80-
total_time=10.0,
81-
streaming_config=self._streaming_config(),
82-
model_config=self._model_config(),
83-
)
84-
85-
csv_path = tmp_path / "generator_benchmark_results.csv"
86-
with csv_path.open() as f:
87-
rows = list(csv.reader(f))
88-
self.assertEqual(rows[0][0], "git_commit")
89-
self.assertEqual(len(rows), 4)
90-
for row in rows[1:]:
91-
self.assertEqual(row[0], "abc")
92-
93-
9417
if __name__ == "__main__":
9518
unittest.main()

open_instruct/test_dpo_utils.py

Lines changed: 0 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
import unittest
55

66
import torch
7-
from parameterized import parameterized
87

98
from open_instruct import dpo_utils
109
from open_instruct.dataset_transformation import TokenizerConfig
@@ -88,28 +87,6 @@ def test_rewards_are_detached(self):
8887
self.assertFalse(rejected_rewards.requires_grad)
8988

9089

91-
class TestGetBatchLogps(unittest.TestCase):
92-
"""Regression tests for PR #1625: division by zero when a sequence has
93-
every label masked (`-100`) and `average_log_prob=True`."""
94-
95-
@parameterized.expand(
96-
[
97-
(
98-
"fully_masked_row_returns_zero_not_nan",
99-
[[-1.0, -2.0, -3.0, -4.0], [-1.0, -2.0, -3.0, -4.0]],
100-
[[0, 1, 2, 3], [-100, -100, -100, -100]],
101-
[-2.0, 0.0],
102-
),
103-
("partially_masked_row_matches_unclamped_average", [[-1.0, -2.0, -3.0, -4.0]], [[0, 1, 2, -100]], [-1.5]),
104-
]
105-
)
106-
def test_average_log_prob(self, _name, per_token_logps, labels, expected):
107-
result = dpo_utils._get_batch_logps(torch.tensor(per_token_logps), torch.tensor(labels), average_log_prob=True)
108-
self.assertFalse(torch.isnan(result).any())
109-
self.assertFalse(torch.isinf(result).any())
110-
torch.testing.assert_close(result, torch.tensor(expected))
111-
112-
11390
class TestSimPOLoss(unittest.TestCase):
11491
"""Tests for simpo_loss function."""
11592

0 commit comments

Comments
 (0)