@@ -163,37 +163,72 @@ def evaluation_function(response: Any, answer: Any, params: Params) -> Result:
163163 _add_common_timing (items , t_handler0 )
164164 return _result (False , items )
165165
166+ # MEM) Environment & memory diagnostics (no torch/ultralytics)
167+ # ----------------------------
166168 if diag == "mem" :
167169 try :
168170 import platform
169171 import resource
172+ import sys
170173
174+ items .append (("MEM" , "env/memory diagnostics" ))
171175 items .append (("platform" , platform .platform ()))
172- items .append (("python" , platform .python_version ()))
176+ items .append (("python_version" , platform .python_version ()))
177+ items .append (("python_implementation" , platform .python_implementation ()))
173178 items .append (("pid" , str (os .getpid ())))
174179
175- # RSS (KB) on Linux from resource; on some platforms may differ
180+ # sys.path (truncate to avoid huge output)
176181 try :
177- rss_kb = resource .getrusage (resource .RUSAGE_SELF ).ru_maxrss
178- items .append (("ru_maxrss" , str (rss_kb )))
182+ sp = "\n " .join (sys .path [:20 ])
183+ items .append (("sys_path_head" , _escape_html (sp ).replace ("\n " , "<br>" )))
184+ except Exception as e :
185+ items .append (("sys_path_head_FAIL" , f"{ type (e ).__name__ } : { e } " ))
186+
187+ # ru_maxrss: max resident set size so far
188+ # On Linux: typically KB; on macOS: bytes. Platform is Linux here.
189+ try :
190+ rss = resource .getrusage (resource .RUSAGE_SELF ).ru_maxrss
191+ items .append (("ru_maxrss_raw" , str (rss )))
192+ # Best-effort human-friendly conversion assuming Linux KB.
193+ try :
194+ rss_mb = float (rss ) / 1024.0
195+ items .append (("ru_maxrss_mb_est" , f"{ rss_mb :.2f} " ))
196+ except Exception :
197+ pass
179198 except Exception as e :
180199 items .append (("ru_maxrss_FAIL" , f"{ type (e ).__name__ } : { e } " ))
181200
182- # cgroup memory limit (common in containers / Lambda)
183- candidates = [
184- "/sys/fs/cgroup/memory.max" , # cgroup v2
185- "/sys/fs/cgroup/memory/memory.limit_in_bytes" , # cgroup v1
201+ # cgroup memory limits / usage
202+ # We ALWAYS emit one entry per path: the file's contents, NOT_FOUND, or a *_READ_FAIL error.
203+ cgroup_files = [
204+ # cgroup v2 common files
205+ "/sys/fs/cgroup/memory.max" ,
206+ "/sys/fs/cgroup/memory.high" ,
207+ "/sys/fs/cgroup/memory.current" ,
208+ "/sys/fs/cgroup/memory.swap.max" ,
209+ "/sys/fs/cgroup/cpu.max" ,
210+ # cgroup v1 common files
211+ "/sys/fs/cgroup/memory/memory.limit_in_bytes" ,
212+ "/sys/fs/cgroup/memory/memory.soft_limit_in_bytes" ,
213+ "/sys/fs/cgroup/memory/memory.usage_in_bytes" ,
214+ "/sys/fs/cgroup/memory/memory.max_usage_in_bytes" ,
186215 ]
187- for p in candidates :
216+
217+ for p in cgroup_files :
218+ key = f"cgroup:{ os .path .basename (p )} "
188219 if os .path .exists (p ):
189220 try :
190221 with open (p , "r" , encoding = "utf-8" ) as f :
191- items .append ((os .path .basename (p ), f .read ().strip ()))
222+ val = f .read ().strip ()
223+ items .append ((key , val ))
192224 except Exception as e :
193- items .append ((os .path .basename (p ) + "_FAIL" , f"{ type (e ).__name__ } : { e } " ))
225+ items .append ((key + "_READ_FAIL" , f"{ type (e ).__name__ } : { e } " ))
226+ else :
227+ items .append ((key , "NOT_FOUND" ))
194228
195229 _add_common_timing (items , t_handler0 )
196230 return _result (False , items )
231+
197232 except Exception as e :
198233 items .append (("MEM_FAIL" , f"{ type (e ).__name__ } : { e } " ))
199234 items .append (("TRACEBACK" , _escape_html (traceback .format_exc ()).replace ("\n " , "<br>" )))
@@ -232,6 +267,27 @@ def evaluation_function(response: Any, answer: Any, params: Params) -> Result:
232267 items .append (("TRACEBACK" , _escape_html (traceback .format_exc ()).replace ("\n " , "<br>" )))
233268 _add_common_timing (items , t_handler0 )
234269 return _result (False , items )
270+
271+ if diag == "alloc" :
272+ try :
273+ step_mb = int (_pget (params , "alloc_step_mb" , 64 ))
274+ max_mb = int (_pget (params , "alloc_max_mb" , 1024 ))
275+ chunks = []
276+ allocated = 0
277+ while allocated + step_mb <= max_mb :
278+ chunks .append (bytearray (step_mb * 1024 * 1024 ))
279+ allocated += step_mb
280+ items .append (("alloc_mb" , str (allocated )))
281+ _add_common_timing (items , t_handler0 )
282+ items .append (("ALLOC_DONE" , f"{ allocated } MB" ))
283+ _add_common_timing (items , t_handler0 )
284+ return _result (False , items )
285+ except Exception as e :
286+ items .append (("ALLOC_FAIL" , f"{ type (e ).__name__ } : { e } " ))
287+ items .append (("TRACEBACK" , _escape_html (traceback .format_exc ()).replace ("\n " , "<br>" )))
288+ _add_common_timing (items , t_handler0 )
289+ return _result (False , items )
290+
235291 # ----------------------------
236292 # A) torch lazy import timing (CPU-only friendly)
237293 # ----------------------------
0 commit comments