Skip to content

Commit e8e348c

Browse files
kvmto and vedika-saravanan
authored and committed
Apply yapf to .claude python files
CI's PR sanity check (yapf) flagged 8 reformatting changes in the new skill-eval pipeline. Reformat in place using the repo's `scripts/run_yapf_format.sh` rules (`based_on_style = google`). Also gitignore `.claude/evals/workspaces/` so per-iteration eval artifacts (`responses.json`, `grading.*.json`, generated reports) don't land in commits. No functional changes; pipeline smoke-tested end-to-end after reformat (preflight, import_smoke, pick_workflow, runner, programmatic grader, aggregate, viewer). Signed-off-by: kvmto <kmato@nvidia.com>
1 parent 303b4e7 commit e8e348c

9 files changed

Lines changed: 433 additions & 209 deletions

File tree

.claude/evals/aggregate.py

Lines changed: 26 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -113,9 +113,8 @@ def _benchmark(configs: dict[str, dict[str, dict]]) -> dict[str, Any]:
113113
cfg_out[grader] = {
114114
"scenarios_total": len(scenarios),
115115
"scenarios_considered": len(considered),
116-
"pass_rate": (
117-
sum(considered) / len(considered) if considered else 0.0
118-
),
116+
"pass_rate":
117+
(sum(considered) / len(considered) if considered else 0.0),
119118
"raw": payload,
120119
}
121120
bench["configurations"][cfg_name] = cfg_out
@@ -140,25 +139,35 @@ def _agreement(configs: dict[str, dict[str, dict]]) -> dict[str, Any]:
140139

141140
grader_verdicts: dict[str, list[bool | None]] = {}
142141
for grader, payload in target.items():
143-
verdicts = [_grader_pass(grader, s) for s in payload.get("scenarios", [])]
142+
verdicts = [
143+
_grader_pass(grader, s) for s in payload.get("scenarios", [])
144+
]
144145
grader_verdicts[grader] = verdicts
145146

146147
pairs: dict[str, dict[str, Any]] = {}
147148
for a, b in combinations(grader_verdicts, 2):
148-
pairs[f"{a}{b}"] = _cohen_kappa(grader_verdicts[a], grader_verdicts[b])
149+
pairs[f"{a}{b}"] = _cohen_kappa(grader_verdicts[a],
150+
grader_verdicts[b])
149151
return {
150-
"configuration": "with_skill" if "with_skill" in configs else next(iter(configs)),
151-
"pairwise_kappa": pairs,
152+
"configuration":
153+
"with_skill" if "with_skill" in configs else next(iter(configs)),
154+
"pairwise_kappa":
155+
pairs,
152156
}
153157

154158

155159
def main() -> int:
156-
p = argparse.ArgumentParser(description=__doc__,
157-
formatter_class=argparse.RawDescriptionHelpFormatter)
158-
p.add_argument("iteration_dir", type=Path,
160+
p = argparse.ArgumentParser(
161+
description=__doc__,
162+
formatter_class=argparse.RawDescriptionHelpFormatter)
163+
p.add_argument("iteration_dir",
164+
type=Path,
159165
help="Path to the iteration directory.")
160-
p.add_argument("--out", type=Path, default=None,
161-
help="Output benchmark.json path. Default: <iteration>/benchmark.json")
166+
p.add_argument(
167+
"--out",
168+
type=Path,
169+
default=None,
170+
help="Output benchmark.json path. Default: <iteration>/benchmark.json")
162171
args = p.parse_args()
163172

164173
if not args.iteration_dir.is_dir():
@@ -190,8 +199,11 @@ def main() -> int:
190199
print(f" {g:<14} {d:+.0%}")
191200
if bench["inter_grader_agreement"].get("pairwise_kappa"):
192201
print()
193-
print(f"Inter-grader Cohen's κ ({bench['inter_grader_agreement']['configuration']}):")
194-
for pair, k in bench["inter_grader_agreement"]["pairwise_kappa"].items():
202+
print(
203+
f"Inter-grader Cohen's κ ({bench['inter_grader_agreement']['configuration']}):"
204+
)
205+
for pair, k in bench["inter_grader_agreement"]["pairwise_kappa"].items(
206+
):
195207
kv = k.get("kappa")
196208
ks = "n/a" if kv is None else f"{kv:+.2f}"
197209
print(f" {pair:<32} κ = {ks} (n={k['n']})")

.claude/evals/graders/executable.py

Lines changed: 47 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -72,17 +72,21 @@ def _resolve_assertions(skill: str, override: Path | None) -> Path:
7272
if override is not None:
7373
return override
7474
if skill not in SKILL_DIRS:
75-
raise SystemExit(f"Unknown skill alias '{skill}'. Known: {sorted(SKILL_DIRS)}")
75+
raise SystemExit(
76+
f"Unknown skill alias '{skill}'. Known: {sorted(SKILL_DIRS)}")
7677
p = ASSERTIONS_DIR / f"{SKILL_DIRS[skill]}.json"
7778
if not p.exists():
7879
raise SystemExit(f"Assertions not found at {p}")
7980
return p
8081

8182

8283
_FENCE_PATTERNS = {
83-
"first_python_block": re.compile(r"```(?:python|py)\s*\n(.*?)\n```", re.DOTALL),
84-
"first_bash_block": re.compile(r"```(?:bash|sh|shell)\s*\n(.*?)\n```", re.DOTALL),
85-
"all_python_blocks": re.compile(r"```(?:python|py)\s*\n(.*?)\n```", re.DOTALL),
84+
"first_python_block":
85+
re.compile(r"```(?:python|py)\s*\n(.*?)\n```", re.DOTALL),
86+
"first_bash_block":
87+
re.compile(r"```(?:bash|sh|shell)\s*\n(.*?)\n```", re.DOTALL),
88+
"all_python_blocks":
89+
re.compile(r"```(?:python|py)\s*\n(.*?)\n```", re.DOTALL),
8690
}
8791

8892

@@ -108,30 +112,45 @@ def _module_present(mod: str) -> bool:
108112
def _run_scenario(scenario_id: str, spec: dict, response: str) -> dict:
109113
rules = spec.get("executable")
110114
if not rules:
111-
return {"id": scenario_id, "status": "skipped",
112-
"reason": "no executable block in assertions"}
115+
return {
116+
"id": scenario_id,
117+
"status": "skipped",
118+
"reason": "no executable block in assertions"
119+
}
113120

114121
# Skip if a required module is missing (saves time + noise).
115122
for needed in rules.get("skip_if_missing", []):
116123
if not _module_present(needed):
117-
return {"id": scenario_id, "status": "skipped",
118-
"reason": f"required module '{needed}' not installed"}
124+
return {
125+
"id": scenario_id,
126+
"status": "skipped",
127+
"reason": f"required module '{needed}' not installed"
128+
}
119129

120130
extractor = rules.get("code_extractor", "first_python_block")
121131
code = _extract_code(response, extractor)
122132
if code is None:
123-
return {"id": scenario_id, "status": "no_code",
124-
"reason": f"no `{extractor}` block found in response"}
133+
return {
134+
"id": scenario_id,
135+
"status": "no_code",
136+
"reason": f"no `{extractor}` block found in response"
137+
}
125138

126-
interp = rules.get("interpreter", "python3" if "python" in extractor else "bash")
139+
interp = rules.get("interpreter",
140+
"python3" if "python" in extractor else "bash")
127141
preamble = rules.get("preamble", "")
128142
harness = rules.get("harness", "{code}")
129-
program = harness.format(code=preamble + code) if "{code}" in harness else preamble + code
143+
program = harness.format(code=preamble +
144+
code) if "{code}" in harness else preamble + code
130145
timeout_s = float(rules.get("timeout_s", 30))
131146

132-
cmd = [interp, "-c", program] if interp != "bash" else ["bash", "-c", program]
147+
cmd = [interp, "-c", program
148+
] if interp != "bash" else ["bash", "-c", program]
133149
try:
134-
r = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout_s)
150+
r = subprocess.run(cmd,
151+
capture_output=True,
152+
text=True,
153+
timeout=timeout_s)
135154
rc, out, err, timed_out = r.returncode, r.stdout, r.stderr, False
136155
except subprocess.TimeoutExpired:
137156
rc, out, err, timed_out = 124, "", f"timeout after {timeout_s}s", True
@@ -167,7 +186,9 @@ def grade(skill: str, responses: dict, assertions_path: Path) -> dict:
167186
for sid, spec in bench["scenarios"].items():
168187
scenarios.append(_run_scenario(sid, spec, responses.get(sid, "")))
169188

170-
counted = [s for s in scenarios if s["status"] in {"passed", "failed", "no_code"}]
189+
counted = [
190+
s for s in scenarios if s["status"] in {"passed", "failed", "no_code"}
191+
]
171192
passed = sum(1 for s in scenarios if s["status"] == "passed")
172193
failed = sum(1 for s in scenarios if s["status"] == "failed")
173194
no_code = sum(1 for s in scenarios if s["status"] == "no_code")
@@ -188,9 +209,11 @@ def grade(skill: str, responses: dict, assertions_path: Path) -> dict:
188209

189210

190211
def main() -> int:
191-
p = argparse.ArgumentParser(description=__doc__,
192-
formatter_class=argparse.RawDescriptionHelpFormatter)
193-
p.add_argument("--skill", required=True,
212+
p = argparse.ArgumentParser(
213+
description=__doc__,
214+
formatter_class=argparse.RawDescriptionHelpFormatter)
215+
p.add_argument("--skill",
216+
required=True,
194217
help=f"Skill alias. Known: {sorted(SKILL_DIRS)}")
195218
p.add_argument("--responses", type=Path, required=True)
196219
p.add_argument("--assertions", type=Path, default=None)
@@ -216,10 +239,14 @@ def main() -> int:
216239
print(f" Failed: {result['failed']}")
217240
print(f" No code: {result['no_code']}")
218241
print(f" Skipped: {result['skipped']}")
219-
print(f" Pass rate: {result['scenario_pass_rate']:.0%} (over passed+failed+no_code)")
242+
print(
243+
f" Pass rate: {result['scenario_pass_rate']:.0%} (over passed+failed+no_code)"
244+
)
220245
for s in result["scenarios"]:
221246
if s["status"] == "failed":
222-
print(f" [{s['id']}] FAILED exit={s['exit_code']}/{s['expected_exit']}")
247+
print(
248+
f" [{s['id']}] FAILED exit={s['exit_code']}/{s['expected_exit']}"
249+
)
223250
print(f" stderr_tail: {s['stderr_tail'][:200]}")
224251
print(f" Wrote: {out}")
225252
return 0

.claude/evals/graders/judge.py

Lines changed: 41 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -66,9 +66,11 @@
6666
}
6767

6868

69-
def _resolve_skill_files(skill: str, ass_override: Path | None) -> tuple[Path, Path]:
69+
def _resolve_skill_files(skill: str,
70+
ass_override: Path | None) -> tuple[Path, Path]:
7071
if skill not in SKILL_DIRS:
71-
raise SystemExit(f"Unknown skill alias '{skill}'. Known: {sorted(SKILL_DIRS)}")
72+
raise SystemExit(
73+
f"Unknown skill alias '{skill}'. Known: {sorted(SKILL_DIRS)}")
7274
full = SKILL_DIRS[skill]
7375
a = ass_override or (ASSERTIONS_DIR / f"{full}.json")
7476
p = PROMPTS_DIR / f"{full}.evals.json"
@@ -113,7 +115,6 @@ def _resolve_skill_files(skill: str, ass_override: Path | None) -> tuple[Path, P
113115
must_not_include: {must_not_include}
114116
"""
115117

116-
117118
# ---------------------------------------------------------------------------
118119
# Backends
119120
# ---------------------------------------------------------------------------
@@ -131,7 +132,10 @@ def _backend_openai(model: str) -> Callable[[str], str] | None:
131132
def call(prompt: str) -> str:
132133
resp = client.chat.completions.create(
133134
model=model,
134-
messages=[{"role": "user", "content": prompt}],
135+
messages=[{
136+
"role": "user",
137+
"content": prompt
138+
}],
135139
temperature=0,
136140
)
137141
return resp.choices[0].message.content or ""
@@ -153,7 +157,10 @@ def call(prompt: str) -> str:
153157
model=model,
154158
max_tokens=400,
155159
temperature=0,
156-
messages=[{"role": "user", "content": prompt}],
160+
messages=[{
161+
"role": "user",
162+
"content": prompt
163+
}],
157164
)
158165
text = ""
159166
for block in resp.content:
@@ -186,7 +193,8 @@ def _parse_judge_reply(text: str) -> dict[str, Any] | None:
186193
except (TypeError, ValueError):
187194
d[k] = 0
188195
d.setdefault("justification", "")
189-
d["total"] = sum(d[k] for k in ("correctness", "specificity", "coverage", "hallucinations"))
196+
d["total"] = sum(d[k] for k in ("correctness", "specificity", "coverage",
197+
"hallucinations"))
190198
return d
191199

192200

@@ -199,11 +207,12 @@ def grade(skill: str, prompts: list[dict], assertions: dict,
199207
spec = assertions["scenarios"].get(sid)
200208
if not spec:
201209
continue
202-
rendered = (RUBRIC
203-
.replace("{prompt}", entry["prompt"])
204-
.replace("{response}", responses.get(sid, "(no response)"))
205-
.replace("{must_include}", json.dumps(spec.get("must_include", [])))
206-
.replace("{must_not_include}", json.dumps(spec.get("must_not_include", []))))
210+
rendered = (RUBRIC.replace("{prompt}", entry["prompt"]).replace(
211+
"{response}", responses.get(sid, "(no response)")).replace(
212+
"{must_include}",
213+
json.dumps(spec.get("must_include", []))).replace(
214+
"{must_not_include}",
215+
json.dumps(spec.get("must_not_include", []))))
207216

208217
record = {"id": sid, "model": model}
209218
if call is None:
@@ -230,8 +239,8 @@ def grade(skill: str, prompts: list[dict], assertions: dict,
230239
scored.append(record)
231240

232241
have_scores = [r for r in scored if r["status"] == "scored"]
233-
avg = (sum(r["total"] for r in have_scores) / (len(have_scores) * 8)
234-
if have_scores else 0.0)
242+
avg = (sum(r["total"] for r in have_scores) /
243+
(len(have_scores) * 8) if have_scores else 0.0)
235244

236245
return {
237246
"grader": "judge",
@@ -245,19 +254,26 @@ def grade(skill: str, prompts: list[dict], assertions: dict,
245254

246255

247256
def main() -> int:
248-
p = argparse.ArgumentParser(description=__doc__,
249-
formatter_class=argparse.RawDescriptionHelpFormatter)
250-
p.add_argument("--skill", required=True, help=f"One of: {sorted(SKILL_DIRS)}")
257+
p = argparse.ArgumentParser(
258+
description=__doc__,
259+
formatter_class=argparse.RawDescriptionHelpFormatter)
260+
p.add_argument("--skill",
261+
required=True,
262+
help=f"One of: {sorted(SKILL_DIRS)}")
251263
p.add_argument("--responses", type=Path, required=True)
252264
p.add_argument("--assertions", type=Path, default=None)
253265
p.add_argument("--backend", choices=sorted(BACKENDS), default="anthropic")
254-
p.add_argument("--model", default="claude-opus-4-5",
255-
help="Pin a specific model. Recorded in the output for reproducibility.")
266+
p.add_argument(
267+
"--model",
268+
default="claude-opus-4-5",
269+
help="Pin a specific model. Recorded in the output for reproducibility."
270+
)
256271
p.add_argument("--out", type=Path, default=None)
257272
p.add_argument("--quiet", action="store_true")
258273
args = p.parse_args()
259274

260-
prompts_path, assertions_path = _resolve_skill_files(args.skill, args.assertions)
275+
prompts_path, assertions_path = _resolve_skill_files(
276+
args.skill, args.assertions)
261277
if not args.responses.exists():
262278
raise SystemExit(f"Responses file not found: {args.responses}")
263279
responses = json.loads(args.responses.read_text())
@@ -274,8 +290,12 @@ def main() -> int:
274290

275291
if not args.quiet:
276292
print(f"Skill: {result['skill']} judge={args.backend}/{args.model}")
277-
print(f" Scored: {result['scenarios_scored']}/{result['scenarios_total']}")
278-
print(f" Avg normalized score (0-1): {result['average_normalized_score']:.2f}")
293+
print(
294+
f" Scored: {result['scenarios_scored']}/{result['scenarios_total']}"
295+
)
296+
print(
297+
f" Avg normalized score (0-1): {result['average_normalized_score']:.2f}"
298+
)
279299
if result["scenarios_scored"] == 0:
280300
print(" (no scenarios scored — check 'reason' fields in the JSON)")
281301
print(f" Wrote: {out}")

0 commit comments

Comments
 (0)