Skip to content

Commit 3e8a7e0

Browse files
committed
Diagnostics.
1 parent 0149310 commit 3e8a7e0

1 file changed

Lines changed: 39 additions & 0 deletions

File tree

arc/scripts/pipe_worker.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,44 @@
4343

4444
logger = logging.getLogger('pipe_worker')
4545

46+
_diagnostics_logged = False
47+
48+
49+
def _log_node_diagnostics() -> None:
    """Write a one-shot snapshot of this node's environment to the error log.

    Intended to be called when a task fails. The snapshot contains the
    hostname, whichever PBS/Slurm job and array variables are set, the
    ``PATH``, ``PYTHONPATH`` and ``TMPDIR`` values, and the output of the
    ``id`` command (user and group membership).

    Without this, a dead compute node leaves no trace of *which* node it
    was: the PBS ``.e`` file records the array index but not the hostname,
    and ``tracejob`` is admin-only on many sites.

    Only the first call in a worker process logs anything; later calls
    return immediately, so one bad node draining many tasks cannot flood
    the log.
    """
    global _diagnostics_logged
    if _diagnostics_logged:
        return
    # Flip the flag before doing any work so the dump is attempted at most
    # once per process, even if something below goes wrong.
    _diagnostics_logged = True

    import socket
    import subprocess

    try:
        hostname = socket.gethostname()
    except Exception:
        # Best effort: keep going with a placeholder rather than abort.
        hostname = 'unknown'

    logger.error('--- NODE DIAGNOSTICS (first task failure on this worker) ---')
    logger.error(f'hostname={hostname}')

    # Scheduler context: report only the variables that are actually set.
    scheduler_vars = (
        'PBS_JOBID', 'PBS_ARRAY_INDEX', 'PBS_O_WORKDIR',
        'SLURM_JOB_ID', 'SLURM_ARRAY_TASK_ID', 'SLURM_NODELIST',
    )
    for name in scheduler_vars:
        value = os.environ.get(name)
        if value is None:
            continue
        logger.error(f'{name}={value}')

    # Always reported; an unset variable is logged as an empty value.
    for name in ('PATH', 'PYTHONPATH', 'TMPDIR'):
        logger.error(f'{name}={os.environ.get(name, "")}')

    try:
        # The timeout bounds how long this call can delay failure handling.
        completed = subprocess.run(
            ['id'],
            capture_output=True,
            text=True,
            timeout=5,
        )
        id_out = completed.stdout.strip()
        logger.error(f'id={id_out}')
    except Exception as exc:
        logger.error(f'id=<failed: {exc}>')

    logger.error('--- END NODE DIAGNOSTICS ---')
83+
4684

4785
def setup_logging(log_path: str) -> None:
4886
"""Configure logging. Safe to call multiple times."""
@@ -202,6 +240,7 @@ def run_task(pipe_root: str, task_id: str, state: TaskStateRecord,
202240
failure_class = type(e).__name__
203241
ended_at = time.time()
204242
logger.error(f'Task {task_id} failed: {failure_class}: {e}')
243+
_log_node_diagnostics()
205244
if scratch_dir:
206245
_copy_outputs(scratch_dir, attempt_dir)
207246
result = locals().get('result') or _make_result_template(task_id, state.attempt_index, started_at)

0 commit comments

Comments
 (0)