|
| 1 | +import json |
| 2 | +import math |
| 3 | +import os |
| 4 | +import sys |
| 5 | + |
| 6 | + |
def get_latest_subdir(work_dir):
    """Return the most recently modified stamp-named subdirectory of *work_dir*.

    Only direct children that are directories and whose names are exactly
    14 digits long are considered (presumably ``YYYYMMDDHHMMSS`` run stamps;
    confirm against whatever creates these directories).

    Args:
        work_dir: Directory to scan.

    Returns:
        ``os.path.join(work_dir, name)`` for the candidate with the greatest
        modification time, or ``None`` when there is no candidate. When
        several candidates share the same mtime, the one with the largest
        name wins, so the result does not depend on ``os.listdir`` ordering.

    Raises:
        OSError: If *work_dir* cannot be listed or a candidate cannot be
            stat'ed.
    """
    candidates = []
    for name in os.listdir(work_dir):
        # Cheap name filter first; only stat entries that can qualify.
        if len(name) != 14 or not name.isdigit():
            continue
        path = os.path.join(work_dir, name)
        if os.path.isdir(path):
            candidates.append((os.path.getmtime(path), name))
    if not candidates:
        return None
    # Tuples compare by mtime first, then by name as a deterministic tie-break.
    _, latest = max(candidates)
    return os.path.join(work_dir, latest)
| 17 | + |
| 18 | + |
def extract_value(file, center_step, metrics):
    """Collect metric values for the three steps that follow *center_step*.

    The file is read as JSON Lines. Every record whose ``"step"`` equals
    ``center_step + 1``, ``center_step + 2`` or ``center_step + 3`` contributes
    one entry per requested metric; a metric that is absent from the record
    contributes ``None``. Note that *center_step* itself is not part of the
    window.

    Blank lines and records that are not JSON objects are skipped, since they
    cannot carry a step.

    Args:
        file: Path of the JSON Lines file to read (UTF-8).
        center_step: Integer step immediately preceding the window.
        metrics: Iterable of metric names (keys looked up in each record).

    Returns:
        A ``(window_steps, by_step)`` tuple where ``window_steps`` is the list
        of the three step numbers and ``by_step[step][metric]`` is the list of
        values seen for that step, in file order (empty when no record for
        the step was found).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    window_steps = list(range(center_step + 1, center_step + 4))
    by_step = {s: {m: [] for m in metrics} for s in window_steps}
    with open(file, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Tolerate empty/whitespace-only lines in the tracker file.
                continue
            obj = json.loads(line)
            if not isinstance(obj, dict):
                # Only JSON objects can have a "step" key.
                continue
            row = by_step.get(obj.get("step"))
            if row is None:
                continue
            for m in metrics:
                row[m].append(obj.get(m))
    return window_steps, by_step
| 33 | + |
| 34 | + |
def _values_match(first, other):
    """Return True when two recorded metric values agree.

    Numbers are compared with a relative tolerance of 1e-6 (no absolute
    tolerance); values ``math.isclose`` cannot handle fall back to ``==``.
    """
    try:
        return math.isclose(other, first, rel_tol=1e-6, abs_tol=0.0)
    except (TypeError, OverflowError):
        # Non-numeric (e.g. str/list) or out-of-float-range values.
        return other == first


def verify_window(path, center_step, metrics):
    """Check that every metric is present and self-consistent in the window.

    The window is whatever ``extract_value`` returns for *center_step*. For
    each metric, in order, the first failing check is recorded and the
    remaining checks for that metric are skipped:

    1. Every window step has at least one record (else "missing step").
    2. No record in the window lacks the metric / has it as ``None``; only
       the first offending step is recorded (else "missing key").
    3. For each step with more than one record, all values match the first
       one (else "inconsistent"). A step with a single record has nothing to
       compare against and passes this check.

    Any failures are printed to ``sys.stderr``.

    Args:
        path: Path of the tracker JSON Lines file.
        center_step: Integer step passed through to ``extract_value``.
        metrics: Iterable of metric names to verify.

    Returns:
        ``True`` when no failure was recorded for any metric, else ``False``.
    """
    window_steps, by_step = extract_value(path, center_step, metrics)
    missing_steps = {}
    missing_keys = []
    not_equal = {}

    for m in metrics:
        absent = [s for s in window_steps if not by_step[s][m]]
        if absent:
            missing_steps[m] = absent
            continue

        # First step where some record did not carry this metric.
        bad_step = next(
            (s for s in window_steps if any(v is None for v in by_step[s][m])),
            None,
        )
        if bad_step is not None:
            missing_keys.append((m, bad_step))
            continue

        for s in window_steps:
            vals = by_step[s][m]
            first = vals[0]
            if not all(_values_match(first, v) for v in vals[1:]):
                not_equal.setdefault(m, []).append((s, list(vals)))

    check_result = not (missing_steps or missing_keys or not_equal)
    if check_result:
        return check_result

    if missing_steps:
        print("Missing step data (no records for this step):", file=sys.stderr)
        for m, steps in missing_steps.items():
            print(f" {m}: step {steps}", file=sys.stderr)
    if missing_keys:
        print("Missing key (metric absent in tracker line, value is None):", file=sys.stderr)
        for m, s in missing_keys:
            print(f" {m}: step {s}", file=sys.stderr)
    if not_equal:
        print("Inconsistent metric values across duplicate records at the same step:", file=sys.stderr)
        for m, pairs in not_equal.items():
            parts = [f"step {s}: {vals}" for s, vals in pairs]
            print(f" {m}: " + "; ".join(parts), file=sys.stderr)
    return check_result
| 82 | + |
if __name__ == "__main__":
    # Command line: ROOT PART_A PART_B CENTER_STEP METRIC[,METRIC...]
    # The run directory is ROOT/$GITHUB_RUN_ID/PART_A/PART_B/<latest stamp dir>.
    if len(sys.argv) < 6:
        sys.exit(
            f"usage: {sys.argv[0]} ROOT PART_A PART_B CENTER_STEP METRIC[,METRIC...]"
        )
    run_id = os.environ.get("GITHUB_RUN_ID")
    if run_id is None:
        sys.exit("GITHUB_RUN_ID environment variable is not set")

    base_dir = f"{sys.argv[1]}/{run_id}/{sys.argv[2]}/{sys.argv[3]}"
    real_dir = get_latest_subdir(base_dir)
    if real_dir is None:
        sys.exit(f"No 14-digit run directory found under {base_dir}")

    tracker = os.path.join(real_dir, "logs/exp_tracking/rank0/tracker.jsonl")
    center_step = int(sys.argv[4])
    metrics = sys.argv[5].split(',')
    # Explicit exit instead of `assert`: assertions are removed under
    # `python -O`, which would skip the verification call entirely.
    if not verify_window(tracker, center_step, metrics):
        sys.exit("Resume validation failed, see the printed output for details")
0 commit comments