Skip to content

Commit 260a9c7

Browse files
committed
B3
1 parent 88095e6 commit 260a9c7

2 files changed

Lines changed: 71 additions & 4 deletions

File tree

arc/scheduler.py

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4126,18 +4126,35 @@ def delete_all_species_jobs(self, label: str):
41264126
label (str): The species label.
41274127
"""
41284128
logger.debug(f'Deleting all jobs for species {label}')
4129+
4130+
def _safe_delete(job, display_name):
    """Best-effort deletion of a single job.

    Calls ``job.delete()``. On any ``Exception`` a warning is logged and the
    error is swallowed, so the caller can carry on with the remaining jobs.
    This is cleanup before troubleshooting / restart: an orphan remote job
    will exit on its own and must not abort the whole scheduler. Without
    this guard, a single failed delete (e.g. the queue rejected ``qdel``)
    propagates a RuntimeError up through delete_all_species_jobs →
    troubleshoot_negative_freq → schedule_jobs → __init__ and the entire
    project dies.

    Args:
        job: The job object to delete; only its ``delete()`` method is used.
        display_name (str): The job name shown in the log messages.
    """
    try:
        job.delete()
    except Exception as exc:
        logger.warning(
            f'Failed to delete job {display_name} for species '
            f'{label}: {type(exc).__name__}: {exc}. Continuing — '
            f'the orphan job (if any) will exit on its own.'
        )
    else:
        # Report success only once the delete has actually gone through, so
        # a failed delete is never preceded by a misleading "Deleted job" line.
        logger.info(f'Deleted job {display_name}')
4147+
41294148
for value in self.job_dict[label].values():
41304149
if value in ['conf_opt', 'tsg']:
41314150
for job_name, job in self.job_dict[label][value].items():
41324151
if label in self.running_jobs.keys() and job_name in self.running_jobs[label] \
41334152
and job.execution_type != 'incore':
4134-
logger.info(f'Deleted job {value}{job_name}')
4135-
job.delete()
4153+
_safe_delete(job, f'{value}{job_name}')
41364154
for job_name, job in value.items():
41374155
if label in self.running_jobs.keys() and job_name in self.running_jobs[label] \
41384156
and job.execution_type != 'incore':
4139-
logger.info(f'Deleted job {job_name}')
4140-
job.delete()
4157+
_safe_delete(job, job_name)
41414158
self.running_jobs[label] = list()
41424159
# Reset paths for this species. Most keys reset to ''; container-valued
41434160
# keys keep their type so the rest of the pipeline (composite tracking,

arc/scheduler_test.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1703,6 +1703,56 @@ def test_delete_all_species_jobs_preserves_sp_composite_dict(self):
17031703
self._write_gaussian_fixture(delta_T_high_path, -1.15)
17041704
sched.post_sp_actions('H2', delta_T_high_path, protocol.corrections[0].high)
17051705

1706+
def test_delete_all_species_jobs_tolerates_one_failed_delete(self):
    """Regression: one ``job.delete()`` that raises must not stop the cleanup.

    ``delete_all_species_jobs`` is best-effort: when a single delete fails
    (e.g. ``qdel`` couldn't kill the job because the queue is unresponsive),
    the remaining jobs must still be deleted, the species's state must still
    be reset, and no exception may escape to the caller.
    """
    scratch_dir = os.path.join(self.project_directory, "fx_delete_failure")
    os.makedirs(scratch_dir, exist_ok=True)
    protocol = CompositeProtocol.from_user_input(
        {"base": {"method": "hf", "basis": "cc-pVTZ"}, "corrections": []}
    )
    h2 = ARCSpecies(label='H2', smiles='[H][H]')
    h2.final_xyz = {
        'symbols': ('H', 'H'),
        'coords': ((0, 0, 0), (0, 0, 0.74)),
        'isotopes': (1, 1),
    }
    scheduler = self._make_scheduler([h2], sp_composite=protocol)

    class _FakeJob:
        """Minimal stand-in for a job: records deletion, optionally fails."""

        def __init__(self, name, raise_on_delete=False):
            self.name = name
            self.raise_on_delete = raise_on_delete
            self.execution_type = 'queue'
            self.deleted = False

        def delete(self):
            if self.raise_on_delete:
                raise RuntimeError(f'Could not delete job {self.name}')
            self.deleted = True

    before_failure = _FakeJob('a4035061')
    failing = _FakeJob('a4035060', raise_on_delete=True)
    after_failure = _FakeJob('a4035062')
    # The failing job sits in the middle of the iteration order, so both the
    # delete preceding it and the delete following it are exercised.
    ordered_jobs = [before_failure, failing, after_failure]

    # job_dict is keyed [label][job_type][job_name → JobAdapter].
    scheduler.job_dict['H2'] = {'sp': {stub.name: stub for stub in ordered_jobs}}
    scheduler.running_jobs['H2'] = [stub.name for stub in ordered_jobs]

    # Must complete without raising.
    scheduler.delete_all_species_jobs('H2')

    self.assertTrue(before_failure.deleted, "Pre-failure delete must still run.")
    self.assertTrue(after_failure.deleted, "Post-failure delete must still run.")
    # The species's state was still reset: running_jobs emptied and the
    # sp_composite path entry is a dict.
    self.assertEqual(scheduler.running_jobs['H2'], [])
    self.assertIsInstance(scheduler.output['H2']['paths']['sp_composite'], dict)
1755+
17061756
# --- Phase 3.5: preset name + reference preservation ------------------- #
17071757

17081758
def test_preset_name_and_reference_survive_to_notebook_section(self):

0 commit comments

Comments
 (0)