Skip to content

Commit 59ee907

Browse files
committed
Clamp task scores inside open interval
1 parent 3d26c56 commit 59ee907

File tree

4 files changed

+24
-3
lines changed

4 files changed

+24
-3
lines changed

inference.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -38,6 +38,7 @@
3838
"https://rohan556-openenv-code-review-arena.hf.space",
3939
]
4040
STDOUT_BROKEN = False
41+
ERROR_SCORE = 0.0001
4142

4243
BASELINE_FINDINGS: dict[str, list[dict[str, Any]]] = {
4344
"authz_admin_export": [
@@ -321,7 +322,7 @@ def emit_failed_task(task_id: str, step_number: int, expected: int) -> None:
321322
emit_block(
322323
"END",
323324
task=task_id,
324-
score=0.0,
325+
score=ERROR_SCORE,
325326
steps=safe_steps,
326327
grade="error",
327328
matched=0,

server/grader.py

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,8 @@
2727
"race_condition": {"race_condition", "concurrency", "double_spend", "atomicity"},
2828
"xss": {"xss", "cross_site_scripting", "unsafe_html"},
2929
}
30+
MIN_OPEN_SCORE = 0.0001
31+
MAX_OPEN_SCORE = 0.9999
3032

3133

3234
@dataclass(frozen=True)
@@ -38,6 +40,10 @@ class MatchBreakdown:
3840
semantic_score: float
3941

4042

43+
def clamp_open_score(value: float) -> float:
44+
return min(MAX_OPEN_SCORE, max(MIN_OPEN_SCORE, value))
45+
46+
4147
def normalize_text(value: str) -> str:
4248
return "".join(ch.lower() if ch.isalnum() else " " for ch in value).strip()
4349

@@ -218,6 +224,7 @@ def grade_submission(
218224
if not references:
219225
false_positive_penalty = min(1.0, 0.32 * len(findings))
220226
overall = max(0.0, 1.0 - false_positive_penalty - duplicate_penalty)
227+
overall = clamp_open_score(overall)
221228
summary = (
222229
"Correctly identified that the refactor is clean."
223230
if not findings
@@ -299,6 +306,7 @@ def grade_submission(
299306
- 0.14 * missed_penalty
300307
)
301308
overall = max(0.0, min(1.0, overall))
309+
overall = clamp_open_score(overall)
302310

303311
summary = (
304312
f"Matched {len(assignments)} of {len(references)} reference findings. "
@@ -322,4 +330,3 @@ def grade_submission(
322330
assessments=ordered_assessments,
323331
missed_reference_ids=[reference.id for reference in unmatched_refs],
324332
)
325-

tests/test_environment.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -146,7 +146,7 @@ def test_emit_failed_task_prints_parseable_error_episode(capsys):
146146
lines = capsys.readouterr().out.strip().splitlines()
147147
assert lines == [
148148
"[STEP] step=1 action=error reward=0.0 done=True phase=error",
149-
"[END] task=sql_injection_report_filters score=0.0 steps=1 grade=error matched=0 expected=1",
149+
"[END] task=sql_injection_report_filters score=0.0001 steps=1 grade=error matched=0 expected=1",
150150
]
151151

152152

tests/test_grader.py

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -52,3 +52,16 @@ def test_clean_task_penalizes_false_positives():
5252
assert scorecard.overall_score < 0.75
5353
assert scorecard.false_positive_penalty > 0
5454

55+
56+
def test_clean_task_perfect_score_stays_below_one():
57+
task = TaskStore().get("safe_logging_refactor")
58+
scorecard = grade_submission(task, [], steps_used=1)
59+
60+
assert 0.0 < scorecard.overall_score < 1.0
61+
62+
63+
def test_bad_submission_score_stays_above_zero():
64+
task = TaskStore().get("sql_injection_report_filters")
65+
scorecard = grade_submission(task, [], steps_used=task.max_steps)
66+
67+
assert 0.0 < scorecard.overall_score < 1.0

0 commit comments

Comments (0)