Skip to content

Commit 8b5e586

Browse files
committed
Improve memory diagnostics and add allocation diagnostic
1 parent 8d25fce commit 8b5e586

1 file changed

Lines changed: 67 additions & 11 deletions

File tree

evaluation_function/evaluation.py

Lines changed: 67 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -163,37 +163,72 @@ def evaluation_function(response: Any, answer: Any, params: Params) -> Result:
163163
_add_common_timing(items, t_handler0)
164164
return _result(False, items)
165165

166+
# MEM) Environment & memory diagnostics (no torch/ultralytics)
167+
# ----------------------------
166168
if diag == "mem":
167169
try:
168170
import platform
169171
import resource
172+
import sys
170173

174+
items.append(("MEM", "env/memory diagnostics"))
171175
items.append(("platform", platform.platform()))
172-
items.append(("python", platform.python_version()))
176+
items.append(("python_version", platform.python_version()))
177+
items.append(("python_implementation", platform.python_implementation()))
173178
items.append(("pid", str(os.getpid())))
174179

175-
# RSS (KB) on Linux from resource; on some platforms may differ
180+
# sys.path (truncate to avoid huge output)
176181
try:
177-
rss_kb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
178-
items.append(("ru_maxrss", str(rss_kb)))
182+
sp = "\n".join(sys.path[:20])
183+
items.append(("sys_path_head", _escape_html(sp).replace("\n", "<br>")))
184+
except Exception as e:
185+
items.append(("sys_path_head_FAIL", f"{type(e).__name__}: {e}"))
186+
187+
# ru_maxrss: max resident set size so far
188+
# Units are platform-dependent: kilobytes on Linux, bytes on macOS. The target platform here is Linux.
189+
try:
190+
rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
191+
items.append(("ru_maxrss_raw", str(rss)))
192+
# Best-effort human-friendly conversion assuming Linux KB.
193+
try:
194+
rss_mb = float(rss) / 1024.0
195+
items.append(("ru_maxrss_mb_est", f"{rss_mb:.2f}"))
196+
except Exception:
197+
pass
179198
except Exception as e:
180199
items.append(("ru_maxrss_FAIL", f"{type(e).__name__}: {e}"))
181200

182-
# cgroup memory limit (common in containers / Lambda)
183-
candidates = [
184-
"/sys/fs/cgroup/memory.max", # cgroup v2
185-
"/sys/fs/cgroup/memory/memory.limit_in_bytes", # cgroup v1
201+
# cgroup memory limits / usage
202+
# We ALWAYS emit one entry per path: the file's contents if readable, a *_READ_FAIL entry on error, or NOT_FOUND.
203+
cgroup_files = [
204+
# cgroup v2 common files
205+
"/sys/fs/cgroup/memory.max",
206+
"/sys/fs/cgroup/memory.high",
207+
"/sys/fs/cgroup/memory.current",
208+
"/sys/fs/cgroup/memory.swap.max",
209+
"/sys/fs/cgroup/cpu.max",
210+
# cgroup v1 common files
211+
"/sys/fs/cgroup/memory/memory.limit_in_bytes",
212+
"/sys/fs/cgroup/memory/memory.soft_limit_in_bytes",
213+
"/sys/fs/cgroup/memory/memory.usage_in_bytes",
214+
"/sys/fs/cgroup/memory/memory.max_usage_in_bytes",
186215
]
187-
for p in candidates:
216+
217+
for p in cgroup_files:
218+
key = f"cgroup:{os.path.basename(p)}"
188219
if os.path.exists(p):
189220
try:
190221
with open(p, "r", encoding="utf-8") as f:
191-
items.append((os.path.basename(p), f.read().strip()))
222+
val = f.read().strip()
223+
items.append((key, val))
192224
except Exception as e:
193-
items.append((os.path.basename(p) + "_FAIL", f"{type(e).__name__}: {e}"))
225+
items.append((key + "_READ_FAIL", f"{type(e).__name__}: {e}"))
226+
else:
227+
items.append((key, "NOT_FOUND"))
194228

195229
_add_common_timing(items, t_handler0)
196230
return _result(False, items)
231+
197232
except Exception as e:
198233
items.append(("MEM_FAIL", f"{type(e).__name__}: {e}"))
199234
items.append(("TRACEBACK", _escape_html(traceback.format_exc()).replace("\n", "<br>")))
@@ -232,6 +267,27 @@ def evaluation_function(response: Any, answer: Any, params: Params) -> Result:
232267
items.append(("TRACEBACK", _escape_html(traceback.format_exc()).replace("\n", "<br>")))
233268
_add_common_timing(items, t_handler0)
234269
return _result(False, items)
270+
271+
if diag == "alloc":
272+
try:
273+
step_mb = int(_pget(params, "alloc_step_mb", 64))
274+
max_mb = int(_pget(params, "alloc_max_mb", 1024))
275+
chunks = []
276+
allocated = 0
277+
while allocated + step_mb <= max_mb:
278+
chunks.append(bytearray(step_mb * 1024 * 1024))
279+
allocated += step_mb
280+
items.append(("alloc_mb", str(allocated)))
281+
_add_common_timing(items, t_handler0)
282+
items.append(("ALLOC_DONE", f"{allocated}MB"))
283+
_add_common_timing(items, t_handler0)
284+
return _result(False, items)
285+
except Exception as e:
286+
items.append(("ALLOC_FAIL", f"{type(e).__name__}: {e}"))
287+
items.append(("TRACEBACK", _escape_html(traceback.format_exc()).replace("\n", "<br>")))
288+
_add_common_timing(items, t_handler0)
289+
return _result(False, items)
290+
235291
# ----------------------------
236292
# A) torch lazy import timing (CPU-only friendly)
237293
# ----------------------------

0 commit comments

Comments
 (0)