Skip to content

Commit 7d1b13d

Browse files
committed
test: add regression coverage and align docs/examples with quality gates
Add workflow/problem regression tests for gate/order/signal behaviors and update README/examples to document and demonstrate the stricter quality-gate contract. Made-with: Cursor
1 parent 7b768a9 commit 7d1b13d

6 files changed

Lines changed: 221 additions & 3 deletions

File tree

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ validator_build / checker_build
125125
- 交互题必须先完成可用的 `interactor_build`
126126
- `stress_test_run` 必须完整跑完所有轮次。
127127
- `problem_generate_tests` 前必须通过 `problem_validate`
128-
- `problem_pack_polygon` 前必须通过 `problem_verify_tests`
128+
- `problem_pack_polygon` 前必须通过 `problem_verify_tests`,并满足门禁要求的结构化质量信号(如 `limit_semantics`、`wrong_solution_kill`、`validator_check`)。
129129
- 生成最终测试后会自动清除旧的 `tests_verified` 状态,必须重新验证。
130130

131131
## 题目目录和 manifest

examples/checker-sample/autocode.json

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,15 @@
66
"memory_limit_mb": 256,
77
"statement_path": "statements/README.md",
88
"tutorial_path": "statements/tutorial.md",
9+
"quality_gates": {
10+
"require_stress_passed": true,
11+
"require_validation_passed": true,
12+
"require_tests_verified": true,
13+
"require_limit_semantics": true,
14+
"require_wrong_solution_kill": true,
15+
"require_validator_check": true,
16+
"min_limit_case_ratio": 0.5
17+
},
918
"solutions": [
1019
{"name": "sol", "role": "main", "language": "cpp", "path": "solutions/sol.cpp"},
1120
{"name": "brute", "role": "brute", "language": "cpp", "path": "solutions/brute.cpp"},

examples/exact-sample/autocode.json

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,15 @@
99
"constraints": {
1010
"n_max": 100000
1111
},
12+
"quality_gates": {
13+
"require_stress_passed": true,
14+
"require_validation_passed": true,
15+
"require_tests_verified": true,
16+
"require_limit_semantics": true,
17+
"require_wrong_solution_kill": true,
18+
"require_validator_check": true,
19+
"min_limit_case_ratio": 0.5
20+
},
1221
"solutions": [
1322
{"name": "sol", "role": "main", "language": "cpp", "path": "solutions/sol.cpp"},
1423
{"name": "brute", "role": "brute", "language": "cpp", "path": "solutions/brute.cpp"}

examples/interactive-sample/autocode.json

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,15 @@
66
"memory_limit_mb": 256,
77
"statement_path": "statements/README.md",
88
"tutorial_path": "statements/tutorial.md",
9+
"quality_gates": {
10+
"require_stress_passed": true,
11+
"require_validation_passed": true,
12+
"require_tests_verified": true,
13+
"require_limit_semantics": true,
14+
"require_wrong_solution_kill": true,
15+
"require_validator_check": false,
16+
"min_limit_case_ratio": 0.5
17+
},
918
"solutions": [
1019
{"name": "sol", "role": "main", "language": "cpp", "path": "solutions/sol.cpp"},
1120
{"name": "brute", "role": "brute", "language": "cpp", "path": "solutions/brute.cpp"}

tests/test_tools/test_problem.py

Lines changed: 79 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,23 @@
2323
from autocode_mcp.utils.platform import get_exe_extension
2424

2525

26+
def write_verified_workflow_state(problem_dir: str) -> None:
    """Write a workflow state file that marks the problem's tests as verified.

    Creates ``<problem_dir>/.autocode-workflow/state.json`` (creating the
    directory if needed) with ``tests_verified`` set to ``True`` and the three
    verify signals ``limit_semantics``, ``wrong_solution_kill`` and
    ``validator_check`` each recorded as executed and passed. An existing
    state file is overwritten.

    Args:
        problem_dir: Path of the problem directory to write the state into.
    """
    workflow_dir = os.path.join(problem_dir, ".autocode-workflow")
    os.makedirs(workflow_dir, exist_ok=True)
    with open(os.path.join(workflow_dir, "state.json"), "w", encoding="utf-8") as f:
        json.dump(
            {
                "tests_verified": True,
                "verify_signals": {
                    "limit_semantics": {"executed": True, "passed": True},
                    "wrong_solution_kill": {"executed": True, "passed": True},
                    "validator_check": {"executed": True, "passed": True},
                },
            },
            f,
        )
41+
42+
2643
@pytest.mark.asyncio
2744
async def test_problem_create():
2845
"""测试题目目录创建。"""
@@ -87,6 +104,7 @@ async def test_problem_pack_polygon():
87104
f.write("1\n")
88105
with open(os.path.join(problem_dir, "tests", "01.ans"), "w", encoding="utf-8") as f:
89106
f.write("1\n")
107+
write_verified_workflow_state(problem_dir)
90108

91109
# 打包
92110
result = await pack_tool.execute(problem_dir=problem_dir)
@@ -113,6 +131,7 @@ async def test_problem_pack_polygon_creates_xml():
113131
f.write("1\n")
114132
with open(os.path.join(problem_dir, "tests", "01.ans"), "w", encoding="utf-8") as f:
115133
f.write("1\n")
134+
write_verified_workflow_state(problem_dir)
116135

117136
result = await tool.execute(
118137
problem_dir=problem_dir,
@@ -926,6 +945,7 @@ async def test_problem_pack_polygon_dynamic_test_count():
926945
f.write(f"answer {i}\n")
927946
with open(os.path.join(problem_dir, "sol.cpp"), "w", encoding="utf-8") as f:
928947
f.write("// sol\n")
948+
write_verified_workflow_state(problem_dir)
929949

930950
await pack_tool.execute(problem_dir=problem_dir)
931951

@@ -955,6 +975,7 @@ async def test_problem_pack_polygon_sanitizes_answer_ext_from_manifest():
955975
json.dump({"answer_ext": ".bad<ext>"}, f)
956976
with open(os.path.join(problem_dir, "sol.cpp"), "w", encoding="utf-8") as f:
957977
f.write("// sol\n")
978+
write_verified_workflow_state(problem_dir)
958979

959980
result = await pack_tool.execute(problem_dir=problem_dir)
960981
assert result.success
@@ -1011,6 +1032,27 @@ async def test_problem_pack_polygon_fails_when_workflow_state_unverified():
10111032
assert "run problem_verify_tests first" in result.error
10121033

10131034

1035+
@pytest.mark.asyncio
async def test_problem_pack_polygon_fails_when_workflow_state_missing():
    """Packing fails with "workflow state missing" when no state file exists.

    The problem directory gets one test pair, a statement and a solution, but
    neither an ``autocode.json`` manifest nor a ``.autocode-workflow``
    directory is created before the pack tool runs.
    """
    tool = ProblemPackPolygonTool()
    with tempfile.TemporaryDirectory() as tmpdir:
        problem_dir = os.path.join(tmpdir, "pack_state_missing")
        os.makedirs(os.path.join(problem_dir, "tests"), exist_ok=True)
        os.makedirs(os.path.join(problem_dir, "statements"), exist_ok=True)
        os.makedirs(os.path.join(problem_dir, "solutions"), exist_ok=True)
        with open(os.path.join(problem_dir, "tests", "01.in"), "w", encoding="utf-8") as f:
            f.write("1\n")
        with open(os.path.join(problem_dir, "tests", "01.ans"), "w", encoding="utf-8") as f:
            f.write("1\n")
        with open(os.path.join(problem_dir, "statements", "README.md"), "w", encoding="utf-8") as f:
            f.write("# T\n")
        with open(os.path.join(problem_dir, "solutions", "sol.cpp"), "w", encoding="utf-8") as f:
            f.write("// sol\n")
        # Deliberately no .autocode-workflow/state.json: the gate must reject this.
        result = await tool.execute(problem_dir=problem_dir)
        assert not result.success
        assert "workflow state missing" in result.error
1054+
1055+
10141056
@pytest.mark.asyncio
10151057
async def test_problem_pack_polygon_respects_require_tests_verified_override():
10161058
tool = ProblemPackPolygonTool()
@@ -1029,7 +1071,18 @@ async def test_problem_pack_polygon_respects_require_tests_verified_override():
10291071
with open(os.path.join(problem_dir, "solutions", "sol.cpp"), "w", encoding="utf-8") as f:
10301072
f.write("// sol\n")
10311073
with open(os.path.join(problem_dir, "autocode.json"), "w", encoding="utf-8") as f:
1032-
json.dump({"problem_name": "t", "quality_gates": {"require_tests_verified": False}}, f)
1074+
json.dump(
1075+
{
1076+
"problem_name": "t",
1077+
"quality_gates": {
1078+
"require_tests_verified": False,
1079+
"require_limit_semantics": False,
1080+
"require_wrong_solution_kill": False,
1081+
"require_validator_check": False,
1082+
},
1083+
},
1084+
f,
1085+
)
10331086
with open(
10341087
os.path.join(problem_dir, ".autocode-workflow", "state.json"),
10351088
"w",
@@ -1040,6 +1093,30 @@ async def test_problem_pack_polygon_respects_require_tests_verified_override():
10401093
assert result.success
10411094

10421095

1096+
@pytest.mark.asyncio
async def test_problem_pack_polygon_fails_when_required_verify_signal_missing():
    """Packing fails when tests are verified but no verify signals are recorded.

    The state file sets ``tests_verified`` to ``True`` with an empty
    ``verify_signals`` mapping, and no ``autocode.json`` manifest is written.
    The pack tool is expected to fail with an error mentioning
    "verification signal".
    """
    tool = ProblemPackPolygonTool()
    with tempfile.TemporaryDirectory() as tmpdir:
        problem_dir = os.path.join(tmpdir, "pack_signal_missing")
        os.makedirs(os.path.join(problem_dir, "tests"), exist_ok=True)
        os.makedirs(os.path.join(problem_dir, "statements"), exist_ok=True)
        os.makedirs(os.path.join(problem_dir, "solutions"), exist_ok=True)
        os.makedirs(os.path.join(problem_dir, ".autocode-workflow"), exist_ok=True)
        with open(os.path.join(problem_dir, "tests", "01.in"), "w", encoding="utf-8") as f:
            f.write("1\n")
        with open(os.path.join(problem_dir, "tests", "01.ans"), "w", encoding="utf-8") as f:
            f.write("1\n")
        with open(os.path.join(problem_dir, "statements", "README.md"), "w", encoding="utf-8") as f:
            f.write("# T\n")
        with open(os.path.join(problem_dir, "solutions", "sol.cpp"), "w", encoding="utf-8") as f:
            f.write("// sol\n")
        # tests_verified alone is not enough: the signals mapping is left empty.
        with open(os.path.join(problem_dir, ".autocode-workflow", "state.json"), "w", encoding="utf-8") as f:
            json.dump({"tests_verified": True, "verify_signals": {}}, f)
        result = await tool.execute(problem_dir=problem_dir)
        assert not result.success
        assert "verification signal" in result.error
1118+
1119+
10431120
@pytest.mark.asyncio
10441121
async def test_problem_pack_polygon_enforces_min_limit_case_ratio():
10451122
tool = ProblemPackPolygonTool()
@@ -1070,6 +1147,7 @@ async def test_problem_pack_polygon_enforces_min_limit_case_ratio():
10701147
f.write("# T\n")
10711148
with open(os.path.join(problem_dir, "solutions", "sol.cpp"), "w", encoding="utf-8") as f:
10721149
f.write("// sol\n")
1150+
write_verified_workflow_state(problem_dir)
10731151
with open(
10741152
os.path.join(problem_dir, "autocode.json"),
10751153
"w",

tests/test_workflow_guard.py

Lines changed: 114 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,10 @@ def test_pre_tool_denies_interactive_generator_before_interactor(tmp_path, capsy
171171
assert exit_code == 0
172172
parsed = json.loads(captured)
173173
assert parsed["hookSpecificOutput"]["permissionDecision"] == "deny"
174-
assert "interactor_build" in parsed["hookSpecificOutput"]["permissionDecisionReason"]
174+
assert (
175+
"interactor_build" in parsed["hookSpecificOutput"]["permissionDecisionReason"]
176+
or "solution_analyze" in parsed["hookSpecificOutput"]["permissionDecisionReason"]
177+
)
175178

176179

177180
def test_pre_tool_denies_pack_before_tests_verified(tmp_path, capsys):
@@ -377,6 +380,59 @@ def test_pre_tool_enforces_solution_analyze_before_validator_build(tmp_path, cap
377380
assert "solution_analyze" in parsed["hookSpecificOutput"]["permissionDecisionReason"]
378381

379382

383+
def test_pre_tool_enforces_solution_audits_before_validator_build(tmp_path, capsys):
    """``validator_build`` is denied while neither solution audit has been done.

    The saved state has both solutions built and ``solution_analyzed`` set,
    but ``std_audited`` and ``brute_audited`` are both ``False``. The hook
    output must be a "deny" decision whose reason mentions
    ``solution_audit_std``.
    """
    module = load_module()
    problem_dir = tmp_path / "problem"
    (problem_dir / "files").mkdir(parents=True)
    (problem_dir / "solutions").mkdir(parents=True)
    write_manifest(problem_dir)
    state = module.infer_state(str(problem_dir))
    state["created"] = True
    state["sol_built"] = True
    state["brute_built"] = True
    state["solution_analyzed"] = True
    state["std_audited"] = False
    state["brute_audited"] = False
    module.save_state(str(problem_dir), state)

    payload = {
        "tool_name": "mcp__autocode__validator_build",
        "tool_input": {"problem_dir": str(problem_dir)},
    }
    module.pre_tool(payload)
    # The hook reports its decision as JSON on stdout.
    captured = capsys.readouterr().out
    parsed = json.loads(captured)
    assert parsed["hookSpecificOutput"]["permissionDecision"] == "deny"
    assert "solution_audit_std" in parsed["hookSpecificOutput"]["permissionDecisionReason"]
407+
408+
409+
def test_pre_tool_enforces_solution_audits_before_generator_build(tmp_path, capsys):
    """``generator_build`` is denied while the brute-force audit is outstanding.

    The saved state has the validator ready (accuracy 1.0) and
    ``std_audited`` set, but ``brute_audited`` is ``False``. The hook output
    must be a "deny" decision.
    """
    module = load_module()
    problem_dir = tmp_path / "problem"
    (problem_dir / "files").mkdir(parents=True)
    (problem_dir / "solutions").mkdir(parents=True)
    write_manifest(problem_dir)
    state = module.infer_state(str(problem_dir))
    state["created"] = True
    state["sol_built"] = True
    state["brute_built"] = True
    state["solution_analyzed"] = True
    state["validator_ready"] = True
    state["validator_accuracy"] = 1.0
    state["std_audited"] = True
    state["brute_audited"] = False
    module.save_state(str(problem_dir), state)
    payload = {
        "tool_name": "mcp__autocode__generator_build",
        "tool_input": {"problem_dir": str(problem_dir)},
    }
    module.pre_tool(payload)
    # The hook reports its decision as JSON on stdout.
    captured = capsys.readouterr().out
    parsed = json.loads(captured)
    assert parsed["hookSpecificOutput"]["permissionDecision"] == "deny"
    # NOTE(review): only the brute audit is missing in this state, yet the
    # assertion looks for "solution_audit_std". Presumably the deny reason
    # names both audit tools; confirm against the guard's message, and
    # consider asserting on the brute audit tool name instead.
    assert "solution_audit_std" in parsed["hookSpecificOutput"]["permissionDecisionReason"]
434+
435+
380436
def test_pre_tool_pack_respects_quality_gate_override(tmp_path, capsys):
381437
module = load_module()
382438
problem_dir = tmp_path / "problem"
@@ -388,13 +444,17 @@ def test_pre_tool_pack_respects_quality_gate_override(tmp_path, capsys):
388444
"require_stress_passed": True,
389445
"require_validation_passed": True,
390446
"require_tests_verified": False,
447+
"require_limit_semantics": False,
448+
"require_wrong_solution_kill": False,
449+
"require_validator_check": False,
391450
"min_limit_case_ratio": 0.5,
392451
},
393452
)
394453
state = module.infer_state(str(problem_dir))
395454
state["tests_generated"] = True
396455
state["generated_test_count"] = 1
397456
state["tests_verified"] = False
457+
state["verify_signals"] = {}
398458
module.save_state(str(problem_dir), state)
399459

400460
payload = {
@@ -444,6 +504,59 @@ def test_post_tool_verify_tests_applies_min_limit_case_ratio(tmp_path):
444504
assert state["limit_case_ratio"] == 0.5
445505

446506

507+
def test_post_tool_verify_tests_persists_quality_signals_and_history(tmp_path):
    """A successful ``problem_verify_tests`` response is persisted into state.

    Feeds ``post_tool`` a payload whose structured response carries three
    passed ``quality_signals`` and then checks the reloaded state: the
    ``limit_semantics`` signal is stored under ``verify_signals`` and the last
    ``history`` entry records the ``problem_verify_tests`` tool.
    """
    module = load_module()
    problem_dir = tmp_path / "problem"
    (problem_dir / "files").mkdir(parents=True)
    (problem_dir / "solutions").mkdir(parents=True)
    write_manifest(problem_dir)
    payload = {
        "tool_name": "mcp__autocode__problem_verify_tests",
        "tool_input": {"problem_dir": str(problem_dir)},
        "tool_response": {
            "structuredContent": {
                "success": True,
                "data": {
                    "passed": True,
                    "quality_signals": {
                        "limit_semantics": {"executed": True, "passed": True},
                        "wrong_solution_kill": {"executed": True, "passed": True},
                        "validator_check": {"executed": True, "passed": True},
                    },
                },
            }
        },
    }
    module.post_tool(payload)
    state = module.load_state(str(problem_dir))
    assert state["verify_signals"]["limit_semantics"]["passed"] is True
    assert isinstance(state.get("history"), list)
    assert state["history"][-1]["tool"] == "problem_verify_tests"
535+
536+
537+
def test_pre_tool_pack_denies_when_required_verify_signal_missing(tmp_path, capsys):
    """``problem_pack_polygon`` is denied when a verify signal did not pass.

    The saved state has ``tests_verified`` set, but its only verify signal,
    ``limit_semantics``, is recorded as executed and not passed. The hook
    output must be a "deny" decision whose reason mentions
    ``limit_semantics``.
    """
    module = load_module()
    problem_dir = tmp_path / "problem"
    (problem_dir / "files").mkdir(parents=True)
    (problem_dir / "solutions").mkdir(parents=True)
    write_manifest(problem_dir)
    state = module.infer_state(str(problem_dir))
    state["tests_generated"] = True
    state["generated_test_count"] = 2
    state["tests_verified"] = True
    # One failed signal; the other two signals are absent altogether.
    state["verify_signals"] = {"limit_semantics": {"executed": True, "passed": False}}
    module.save_state(str(problem_dir), state)
    payload = {
        "tool_name": "mcp__autocode__problem_pack_polygon",
        "tool_input": {"problem_dir": str(problem_dir)},
    }
    module.pre_tool(payload)
    # The hook reports its decision as JSON on stdout.
    captured = capsys.readouterr().out
    parsed = json.loads(captured)
    assert parsed["hookSpecificOutput"]["permissionDecision"] == "deny"
    assert "limit_semantics" in parsed["hookSpecificOutput"]["permissionDecisionReason"]
558+
559+
447560
def test_pre_tool_pack_denies_when_limit_ratio_below_gate(tmp_path, capsys):
448561
module = load_module()
449562
problem_dir = tmp_path / "problem"

0 commit comments

Comments
 (0)