Skip to content

Commit a09a027

Browse files
jayhesselberth, claude, and cmeesters
authored
fix: pipeline hangs when submitting from compute nodes (#450)
When running snakemake from within a SLURM job (e.g., an interactive session on a compute node), the pipeline would submit jobs but never detect their completion, hanging forever. The RemoteExecutor base class starts a status-checking daemon thread in `__init__` *before* `__post_init__` is called. The SLURM plugin's `warn_on_jobcontext()` in `__post_init__` would sleep 5 seconds and then delete SLURM environment variables, but by then the daemon thread had already started and would silently die after its first polling cycle. Fix: move the SLURM environment detection and cleanup into `__init__`, before `super().__init__()` starts the daemon thread. Remove the now unnecessary `warn_on_jobcontext()` method and its 5-second sleep. <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **Bug Fixes** * Cleaner SLURM environment detection and immediate cleanup during executor startup, with an earlier warning when a SLURM job context is present to improve job submission reliability. * **Tests** * Test suite updated to align with the revised executor initialization and warning behavior; expectations remain unchanged. <!-- end of auto-generated comment: release notes by coderabbit.ai --> --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com> Co-authored-by: Christian Meesters <cmeesters@users.noreply.github.com>
1 parent 8ba024b commit a09a027

2 files changed

Lines changed: 24 additions & 29 deletions

File tree

snakemake_executor_plugin_slurm/__init__.py

Lines changed: 17 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -428,9 +428,22 @@ def _select_logdir(workflow):
428428
# Required:
429429
# Implementation of your executor
430430
class Executor(RemoteExecutor):
431+
def __init__(self, workflow, logger):
432+
# Must clean SLURM environment BEFORE super().__init__() starts
433+
# the wait_thread, otherwise the thread races with __post_init__
434+
# and silently dies, causing the pipeline to hang forever.
435+
if "SLURM_JOB_ID" in os.environ:
436+
logger.warning(
437+
"You are running snakemake in a SLURM job context. "
438+
"This is not recommended, as it may lead to unexpected "
439+
"behavior. "
440+
"If possible, please run Snakemake directly on the "
441+
"login node."
442+
)
443+
delete_slurm_environment()
444+
super().__init__(workflow, logger)
445+
431446
def __post_init__(self, test_mode: bool = False):
432-
# run check whether we are running in a SLURM job context
433-
self.warn_on_jobcontext()
434447
self.test_mode = test_mode
435448

436449
self.run_uuid = str(uuid.uuid4())
@@ -617,20 +630,6 @@ def clean_old_logs(self) -> None:
617630
f"Could not delete empty directories in {self.slurm_logdir}: {e}"
618631
)
619632

620-
def warn_on_jobcontext(self, done=None):
621-
if not done:
622-
if "SLURM_JOB_ID" in os.environ:
623-
self.logger.warning(
624-
"You are running snakemake in a SLURM job context. "
625-
"This is not recommended, as it may lead to unexpected "
626-
"behavior. "
627-
"If possible, please run Snakemake directly on the "
628-
"login node."
629-
)
630-
time.sleep(5)
631-
delete_slurm_environment()
632-
done = True
633-
634633
def additional_general_args(self):
635634
"""
636635
This function defines additional arguments to be
@@ -1451,14 +1450,12 @@ async def check_active_jobs(
14511450
)
14521451
elif status == "PREEMPTED" and not self._preemption_warning:
14531452
self._preemption_warning = True
1454-
self.logger.warning(
1455-
"""
1453+
self.logger.warning("""
14561454
===== A Job preemption occured! =====
14571455
Leave Snakemake running, if possible. Otherwise Snakemake
14581456
needs to restart this job upon a Snakemake restart.
14591457
1460-
We leave it to SLURM to resume your job(s)"""
1461-
)
1458+
We leave it to SLURM to resume your job(s)""")
14621459
yield j
14631460
elif status == "UNKNOWN":
14641461
# the job probably does not exist anymore, but 'sacct' did not work

tests/test_cli.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -34,19 +34,17 @@ def _make_executor(jobname_prefix: str):
3434
def test_jobname_prefix_applied():
3535
executor = _make_executor("testprefix")
3636

37-
with patch.object(Executor, "warn_on_jobcontext", return_value=None):
38-
with patch(
39-
"snakemake_executor_plugin_slurm.uuid.uuid4",
40-
return_value=uuid.UUID("00000000-0000-0000-0000-000000000000"),
41-
):
42-
executor.__post_init__(test_mode=True)
37+
with patch(
38+
"snakemake_executor_plugin_slurm.uuid.uuid4",
39+
return_value=uuid.UUID("00000000-0000-0000-0000-000000000000"),
40+
):
41+
executor.__post_init__(test_mode=True)
4342

4443
assert executor.run_uuid == "testprefix_00000000-0000-0000-0000-000000000000"
4544

4645

4746
def test_jobname_prefix_validation():
4847
executor = _make_executor("bad!prefix")
4948

50-
with patch.object(Executor, "warn_on_jobcontext", return_value=None):
51-
with pytest.raises(WorkflowError, match="jobname_prefix"):
52-
executor.__post_init__(test_mode=True)
49+
with pytest.raises(WorkflowError, match="jobname_prefix"):
50+
executor.__post_init__(test_mode=True)

0 commit comments

Comments (0)