@@ -1703,6 +1703,56 @@ def test_delete_all_species_jobs_preserves_sp_composite_dict(self):
17031703 self ._write_gaussian_fixture (delta_T_high_path , - 1.15 )
17041704 sched .post_sp_actions ('H2' , delta_T_high_path , protocol .corrections [0 ].high )
17051705
def test_delete_all_species_jobs_tolerates_one_failed_delete(self):
    """Check that one failing ``job.delete()`` does not abort species cleanup.

    Regression: a single failed ``job.delete()`` (e.g. ``qdel`` couldn't
    kill the job because the queue is unresponsive) must NOT abort the
    whole scheduler. ``delete_all_species_jobs`` is best-effort cleanup —
    an orphaned remote job will exit on its own. The other jobs still
    need to be deleted, the species's state still needs to be reset, and
    the scheduler must keep running.
    """
    # NOTE(review): this directory is created but not referenced again in
    # this test — confirm whether it is still needed (it may only be here
    # to guarantee that the project directory exists).
    tmp = os.path.join(self.project_directory, "fx_delete_failure")
    os.makedirs(tmp, exist_ok=True)
    recipe = {"base": {"method": "hf", "basis": "cc-pVTZ"}, "corrections": []}
    protocol = CompositeProtocol.from_user_input(recipe)
    spc = ARCSpecies(label='H2', smiles='[H][H]')
    spc.final_xyz = {'symbols': ('H', 'H'),
                     'coords': ((0, 0, 0), (0, 0, 0.74)),
                     'isotopes': (1, 1)}
    sched = self._make_scheduler([spc], sp_composite=protocol)

    class _StubJob:
        """Minimal job stand-in whose ``delete()`` can be made to fail."""

        def __init__(self, name, raise_on_delete=False):
            self.name = name
            self.execution_type = 'queue'
            # True once ``delete()`` was invoked, whether or not it succeeded.
            self.delete_called = False
            # True only when ``delete()`` ran to completion.
            self.deleted = False
            self.raise_on_delete = raise_on_delete

        def delete(self):
            # Record the attempt first so the test can tell "never called"
            # apart from "called and failed".
            self.delete_called = True
            if self.raise_on_delete:
                raise RuntimeError(f'Could not delete job {self.name}')
            self.deleted = True

    bad = _StubJob('a4035060', raise_on_delete=True)
    good_a = _StubJob('a4035061')
    good_b = _StubJob('a4035062')
    # job_dict is keyed [label][job_type][job_name → JobAdapter].
    # The ordering puts the failing job in the middle so we verify both
    # the deletes before AND after it still run.
    sched.job_dict['H2'] = {'sp': {
        'a4035061': good_a,
        'a4035060': bad,
        'a4035062': good_b,
    }}
    sched.running_jobs['H2'] = ['a4035061', 'a4035060', 'a4035062']
    # Should not raise.
    sched.delete_all_species_jobs('H2')
    # Guard against a vacuous pass: the failure path must really have been
    # exercised, i.e. the failing delete was attempted and did not complete.
    self.assertTrue(bad.delete_called, "The failing delete must actually be attempted.")
    self.assertFalse(bad.deleted, "The failing delete must not be recorded as successful.")
    self.assertTrue(good_a.deleted, "Pre-failure delete must still run.")
    self.assertTrue(good_b.deleted, "Post-failure delete must still run.")
    # And the species's state still got reset (running_jobs cleared, paths
    # rebuilt with sp_composite as a dict, etc.).
    self.assertEqual(sched.running_jobs['H2'], [])
    self.assertIsInstance(sched.output['H2']['paths']['sp_composite'], dict)
1755+
17061756 # --- Phase 3.5: preset name + reference preservation ------------------- #
17071757
17081758 def test_preset_name_and_reference_survive_to_notebook_section (self ):
0 commit comments