Skip to content

Commit 710d428

Browse files
committed
Cam's solution
1 parent 848e834 commit 710d428

3 files changed

Lines changed: 98 additions & 39 deletions

File tree

.github/workflows/benchmark-tmpl.yml

Lines changed: 8 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -172,16 +172,15 @@ jobs:
172172
uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
173173
with:
174174
name: eval_${{ env.EXP_NAME }}_${{ env.RESULT_FILENAME }}
175-
path: eval_out/${{ env.RESULT_FILENAME }}
175+
path: |
176+
SUMMARY.md
177+
meta_env.json
178+
results*.json
179+
if-no-files-found: ignore
176180

177181
- name: Cleanup eval outputs (post-upload)
178182
if: ${{ env.RUN_EVAL == 'true' }}
179183
run: |
180-
if [ -n "${RESULT_FILENAME:-}" ] && [ -e "eval_out/${RESULT_FILENAME}" ]; then
181-
echo "Removing eval dir: eval_out/${RESULT_FILENAME}"
182-
rm -rf --one-file-system "eval_out/${RESULT_FILENAME}" || rm -rf "eval_out/${RESULT_FILENAME}" || true
183-
fi
184-
# Also remove empty parent folder if present
185-
if [ -d "eval_out" ]; then
186-
rmdir eval_out 2>/dev/null || true
187-
fi
184+
rm -f SUMMARY.md meta_env.json || true
185+
# Remove any eval results JSONs that were moved into workspace
186+
rm -f results*.json || true

benchmarks/benchmark_lib.sh

Lines changed: 25 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -326,9 +326,28 @@ META
326326
fi
327327
fi
328328

329-
# Note: Per policy, eval outputs stay under /tmp only; do not copy to workspace.
329+
# Move eval artifacts into PWD (no new directories in workspace)
330+
if [ -f "${summary_md}" ]; then
331+
mv -f "${summary_md}" ./ || true
332+
fi
333+
if [ -f "${meta_json}" ]; then
334+
mv -f "${meta_json}" ./ || true
335+
fi
336+
if [ -d "${out_dir}" ]; then
337+
while IFS= read -r -d '' jf; do
338+
base=$(basename "$jf")
339+
if [ "$base" != "meta_env.json" ] && [ "$base" != "SUMMARY.md" ]; then
340+
mv -f "$jf" ./ || true
341+
fi
342+
done < <(find "${out_dir}" -type f -name "*.json" -print0 2>/dev/null)
343+
fi
330344

331-
echo "Results saved to: ${summary_md}"
345+
# Best-effort cleanup of the temp directory
346+
if [ -n "${out_dir}" ] && [ -d "${out_dir}" ]; then
347+
rm -rf --one-file-system "${out_dir}" || rm -rf "${out_dir}" || true
348+
fi
349+
350+
echo "Moved eval artifacts to: $(pwd)"
332351
}
333352

334353
# ------------------------------
@@ -565,7 +584,7 @@ run_lighteval_eval() {
565584
local port="${PORT:-8888}"
566585
local task="${EVAL_TASK:-gsm8k}"
567586
local num_fewshot="${NUM_FEWSHOT:-5}"
568-
local results_dir="${EVAL_RESULT_DIR:-eval_out_lighteval}"
587+
local results_dir="${EVAL_RESULT_DIR:-$(mktemp -d /tmp/eval_out-XXXXXX)}"
569588
local max_samples=0
570589
local concurrent_requests=32
571590

@@ -611,6 +630,9 @@ run_lighteval_eval() {
611630
output_dir="/workspace/${results_dir}"
612631
fi
613632

633+
# Make output dir visible to append_lm_eval_summary
634+
export EVAL_RESULT_DIR="$output_dir"
635+
614636
set -x
615637
lighteval endpoint litellm \
616638
"${MODEL_ARGS}" \

utils/collect_eval_results.py

Lines changed: 65 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -7,8 +7,29 @@
77

88

99
def find_eval_sets(root: Path) -> List[Path]:
10-
"""Return directories that contain a meta_env.json (one set per job)."""
10+
"""Return directories that contain a meta_env.json (one set per job).
11+
12+
New structure: each downloaded artifact is placed under
13+
eval_results/<artifact-name>/ with flat files inside, e.g.:
14+
- meta_env.json
15+
- SUMMARY.md
16+
- results_*.json
17+
18+
We first check immediate child directories for meta_env.json to avoid
19+
descending unnecessarily. If nothing is found (backward compatibility),
20+
fall back to recursive search.
21+
"""
1122
out: List[Path] = []
23+
# Prefer immediate children (one directory per artifact)
24+
try:
25+
for d in root.iterdir():
26+
if d.is_dir() and (d / 'meta_env.json').exists():
27+
out.append(d)
28+
except Exception:
29+
pass
30+
if out:
31+
return out
32+
# Fallback: recursive (legacy structure)
1233
for p in root.rglob('meta_env.json'):
1334
out.append(p.parent)
1435
return out
@@ -23,32 +44,49 @@ def load_json(path: Path) -> Optional[Dict[str, Any]]:
2344

2445

2546
def detect_eval_jsons(d: Path) -> Tuple[Optional[Path], Optional[Path]]:
26-
"""Return (lm_eval_json, lighteval_json) if present (latest by mtime)."""
27-
lm: List[Tuple[float, Path]] = []
28-
le: List[Tuple[float, Path]] = []
29-
for p in d.rglob('*.json'):
30-
if p.name == 'meta_env.json':
31-
continue
32-
data = load_json(p)
33-
if not isinstance(data, dict):
34-
continue
35-
# Heuristics similar to utils/lm_eval_to_md.py
36-
if 'lm_eval_version' in data or 'pretty_env_info' in data:
37-
try:
38-
lm.append((p.stat().st_mtime, p))
39-
except Exception:
40-
lm.append((0, p))
41-
elif 'config_general' in data and 'results' in data:
42-
try:
43-
le.append((p.stat().st_mtime, p))
44-
except Exception:
45-
le.append((0, p))
46-
elif 'results' in data:
47-
# Fallback: treat as lm-eval JSON
48-
try:
49-
lm.append((p.stat().st_mtime, p))
50-
except Exception:
51-
lm.append((0, p))
47+
"""Return (lm_eval_json, lighteval_json) if present (latest by mtime).
48+
49+
New structure places result JSONs flat in the artifact directory. We
50+
first check only the immediate directory for JSONs, then fall back to
51+
recursive search for backward compatibility.
52+
"""
53+
def scan_jsons(paths: List[Path]) -> Tuple[List[Tuple[float, Path]], List[Tuple[float, Path]]]:
54+
lm: List[Tuple[float, Path]] = []
55+
le: List[Tuple[float, Path]] = []
56+
for p in paths:
57+
if p.name == 'meta_env.json':
58+
continue
59+
data = load_json(p)
60+
if not isinstance(data, dict):
61+
continue
62+
# Heuristics similar to utils/lm_eval_to_md.py
63+
if 'lm_eval_version' in data or 'pretty_env_info' in data:
64+
try:
65+
lm.append((p.stat().st_mtime, p))
66+
except Exception:
67+
lm.append((0, p))
68+
elif 'config_general' in data and 'results' in data:
69+
try:
70+
le.append((p.stat().st_mtime, p))
71+
except Exception:
72+
le.append((0, p))
73+
elif 'results' in data:
74+
# Fallback: treat as lm-eval JSON
75+
try:
76+
lm.append((p.stat().st_mtime, p))
77+
except Exception:
78+
lm.append((0, p))
79+
return lm, le
80+
81+
# 1) Prefer immediate JSONs (flat structure)
82+
immediate_jsons = list(d.glob('results*.json')) + [p for p in d.glob('*.json') if p.name != 'meta_env.json']
83+
lm, le = scan_jsons(immediate_jsons)
84+
85+
# 2) If nothing found, fallback to deep scan (legacy)
86+
if not lm and not le:
87+
deep_jsons = list(d.rglob('*.json'))
88+
lm, le = scan_jsons(deep_jsons)
89+
5290
lm_path = sorted(lm, key=lambda x: x[0])[-1][1] if lm else None
5391
le_path = sorted(le, key=lambda x: x[0])[-1][1] if le else None
5492
return lm_path, le_path

0 commit comments

Comments
 (0)