Cam's solution

Oseltamivir · Oseltamivir · commit 710d4280a0ac · 2025-12-03T01:12:29.000+08:00
diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
@@ -172,16 +172,15 @@ jobs:
         uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
         with:
           name: eval_${{ env.EXP_NAME }}_${{ env.RESULT_FILENAME }}
-          path: eval_out/${{ env.RESULT_FILENAME }}
+          path: |
+            SUMMARY.md
+            meta_env.json
+            results*.json
+          if-no-files-found: ignore
 
       - name: Cleanup eval outputs (post-upload)
         if: ${{ env.RUN_EVAL == 'true' }}
         run: |
-          if [ -n "${RESULT_FILENAME:-}" ] && [ -e "eval_out/${RESULT_FILENAME}" ]; then
-            echo "Removing eval dir: eval_out/${RESULT_FILENAME}"
-            rm -rf --one-file-system "eval_out/${RESULT_FILENAME}" || rm -rf "eval_out/${RESULT_FILENAME}" || true
-          fi
-          # Also remove empty parent folder if present
-          if [ -d "eval_out" ]; then
-            rmdir eval_out 2>/dev/null || true
-          fi
+          rm -f SUMMARY.md meta_env.json || true
+          # Remove any eval results JSONs that were moved into workspace
+          rm -f results*.json || true
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
@@ -326,9 +326,28 @@ META
         fi
     fi
 
-    # Note: Per policy, eval outputs stay under /tmp only; do not copy to workspace.
+    # Move eval artifacts into PWD (no new directories in workspace)
+    if [ -f "${summary_md}" ]; then
+        mv -f "${summary_md}" ./ || true
+    fi
+    if [ -f "${meta_json}" ]; then
+        mv -f "${meta_json}" ./ || true
+    fi
+    if [ -d "${out_dir}" ]; then
+        while IFS= read -r -d '' jf; do
+            base=$(basename "$jf")
+            if [ "$base" != "meta_env.json" ] && [ "$base" != "SUMMARY.md" ]; then
+                mv -f "$jf" ./ || true
+            fi
+        done < <(find "${out_dir}" -type f -name "*.json" -print0 2>/dev/null)
+    fi
 
-    echo "Results saved to: ${summary_md}"
+    # Best-effort cleanup of the temp directory
+    if [ -n "${out_dir}" ] && [ -d "${out_dir}" ]; then
+        rm -rf --one-file-system "${out_dir}" || rm -rf "${out_dir}" || true
+    fi
+
+    echo "Moved eval artifacts to: $(pwd)"
 }
 
 # ------------------------------
@@ -565,7 +584,7 @@ run_lighteval_eval() {
     local port="${PORT:-8888}"
     local task="${EVAL_TASK:-gsm8k}"
     local num_fewshot="${NUM_FEWSHOT:-5}"
-    local results_dir="${EVAL_RESULT_DIR:-eval_out_lighteval}"
+    local results_dir="${EVAL_RESULT_DIR:-$(mktemp -d /tmp/eval_out-XXXXXX)}"
     local max_samples=0
     local concurrent_requests=32
 
@@ -611,6 +630,9 @@ run_lighteval_eval() {
         output_dir="/workspace/${results_dir}"
     fi
 
+    # Make output dir visible to append_lm_eval_summary
+    export EVAL_RESULT_DIR="$output_dir"
+
     set -x
     lighteval endpoint litellm \
         "${MODEL_ARGS}" \
diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py
@@ -7,8 +7,29 @@
 
 
 def find_eval_sets(root: Path) -> List[Path]:
-    """Return directories that contain a meta_env.json (one set per job)."""
+    """Return directories that contain a meta_env.json (one set per job).
+
+    New structure: each downloaded artifact is placed under
+    eval_results/<artifact-name>/ with flat files inside, e.g.:
+      - meta_env.json
+      - SUMMARY.md
+      - results_*.json
+
+    We first check immediate child directories for meta_env.json to avoid
+    descending unnecessarily. If nothing is found (backward compatibility),
+    fall back to recursive search.
+    """
     out: List[Path] = []
+    # Prefer immediate children (one directory per artifact)
+    try:
+        for d in root.iterdir():
+            if d.is_dir() and (d / 'meta_env.json').exists():
+                out.append(d)
+    except OSError:
+        pass  # best effort: unreadable root/child; keep any directories found so far
+    if out:
+        return out
+    # Fallback: recursive (legacy structure)
     for p in root.rglob('meta_env.json'):
         out.append(p.parent)
     return out
@@ -23,32 +44,49 @@ def load_json(path: Path) -> Optional[Dict[str, Any]]:
 
 
 def detect_eval_jsons(d: Path) -> Tuple[Optional[Path], Optional[Path]]:
-    """Return (lm_eval_json, lighteval_json) if present (latest by mtime)."""
-    lm: List[Tuple[float, Path]] = []
-    le: List[Tuple[float, Path]] = []
-    for p in d.rglob('*.json'):
-        if p.name == 'meta_env.json':
-            continue
-        data = load_json(p)
-        if not isinstance(data, dict):
-            continue
-        # Heuristics similar to utils/lm_eval_to_md.py
-        if 'lm_eval_version' in data or 'pretty_env_info' in data:
-            try:
-                lm.append((p.stat().st_mtime, p))
-            except Exception:
-                lm.append((0, p))
-        elif 'config_general' in data and 'results' in data:
-            try:
-                le.append((p.stat().st_mtime, p))
-            except Exception:
-                le.append((0, p))
-        elif 'results' in data:
-            # Fallback: treat as lm-eval JSON
-            try:
-                lm.append((p.stat().st_mtime, p))
-            except Exception:
-                lm.append((0, p))
+    """Return (lm_eval_json, lighteval_json) if present (latest by mtime).
+
+    New structure places result JSONs flat in the artifact directory. We
+    first check only the immediate directory for JSONs, then fall back to
+    recursive search for backward compatibility.
+    """
+    def scan_jsons(paths: List[Path]) -> Tuple[List[Tuple[float, Path]], List[Tuple[float, Path]]]:
+        lm: List[Tuple[float, Path]] = []
+        le: List[Tuple[float, Path]] = []
+        for p in paths:
+            if p.name == 'meta_env.json':
+                continue
+            data = load_json(p)
+            if not isinstance(data, dict):
+                continue
+            # Heuristics similar to utils/lm_eval_to_md.py
+            if 'lm_eval_version' in data or 'pretty_env_info' in data:
+                try:
+                    lm.append((p.stat().st_mtime, p))
+                except Exception:
+                    lm.append((0, p))
+            elif 'config_general' in data and 'results' in data:
+                try:
+                    le.append((p.stat().st_mtime, p))
+                except Exception:
+                    le.append((0, p))
+            elif 'results' in data:
+                # Fallback: treat as lm-eval JSON
+                try:
+                    lm.append((p.stat().st_mtime, p))
+                except Exception:
+                    lm.append((0, p))
+        return lm, le
+
+    # 1) Prefer immediate JSONs (flat structure); one glob so no file is loaded twice
+    immediate_jsons = [p for p in d.glob('*.json') if p.name != 'meta_env.json']
+    lm, le = scan_jsons(immediate_jsons)
+
+    # 2) If nothing found, fallback to deep scan (legacy)
+    if not lm and not le:
+        deep_jsons = list(d.rglob('*.json'))
+        lm, le = scan_jsons(deep_jsons)
+
     lm_path = sorted(lm, key=lambda x: x[0])[-1][1] if lm else None
     le_path = sorted(le, key=lambda x: x[0])[-1][1] if le else None
     return lm_path, le_path