Skip to content

Commit 1eb9430

Browse files
alongd and claude committed
tests: zombie kill-cap regression + align stubs with 6h grace
Add a two-pass regression test asserting that a second zombie detection for the same (species, job_type) is a no-op after one kill+resubmit.

Derive stub-job offsets from ZOMBIE_GRACE_SECONDS, which commit 9fdb8b9 raised to 6h without updating the tests.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
1 parent 6c433bd commit 1eb9430

2 files changed

Lines changed: 31 additions & 4 deletions

File tree

arc/job/zombie_test.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@
1414

1515

1616
def _stub_job(job_adapter='molpro', job_type='sp', execution_type='queue',
17-
initial_offset_seconds=7200, job_name='sp_a3177', job_id=12345,
17+
initial_offset_seconds=zombie.ZOMBIE_GRACE_SECONDS + 3600,
18+
job_name='sp_a3177', job_id=12345,
1819
server='server1', remote_path='/remote/no/such/path',
1920
local_path='/tmp/no/such/path',
2021
local_path_to_output_file='/tmp/no/such/output.out'):
@@ -36,7 +37,7 @@ def test_periodic_writers_set(self):
3637
)
3738

3839
def test_grace_period_default(self):
39-
self.assertEqual(zombie.ZOMBIE_GRACE_SECONDS, 3600)
40+
self.assertEqual(zombie.ZOMBIE_GRACE_SECONDS, 21600)
4041

4142

4243
class TestIsZombie(unittest.TestCase):

arc/scheduler_test.py

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from arc.common import ARC_PATH, ARC_TESTING_PATH, almost_equal_coords_lists, initialize_job_types, read_yaml_file
2323
from arc.job.adapters.common import default_incore_adapters, ts_adapters_by_rmg_family, ts_adapters_for_unknown_unimolecular
2424
from arc.job.factory import job_factory
25+
from arc.job.zombie import ZOMBIE_GRACE_SECONDS
2526
from arc.level import Level
2627
from arc.level.protocol import CompositeProtocol
2728
from arc.plotter import save_conformers_file
@@ -2567,7 +2568,7 @@ def _make_sched(self):
25672568
return sched
25682569

25692570
def _stub_job(self, job_adapter='molpro', job_type='sp', execution_type='queue',
2570-
initial_offset_seconds=7200, job_name='sp_a3177', job_id=12345):
2571+
initial_offset_seconds=ZOMBIE_GRACE_SECONDS + 3600, job_name='sp_a3177', job_id=12345):
25712572
job = SimpleNamespace(
25722573
job_name=job_name, job_type=job_type, job_id=job_id,
25732574
job_adapter=job_adapter, execution_type=execution_type,
@@ -2615,7 +2616,7 @@ def test_healthy_job_not_killed(self):
26152616

26162617
def test_grace_period_blocks_zombie_check(self):
26172618
sched = self._make_sched()
2618-
# Spawned 30 minutes ago — within the 1hr grace period.
2619+
# Spawned 30 minutes ago — within the grace period.
26192620
job = self._stub_job(initial_offset_seconds=1800)
26202621
self._install(sched, job)
26212622
sched._run_a_job = lambda *a, **kw: self.fail("must not act inside grace window")
@@ -2663,6 +2664,31 @@ def test_cap_prevents_double_resubmit(self):
26632664
self.assertFalse(job.deleted)
26642665
self.assertIn(job.job_name, sched.running_jobs['H2'])
26652666

2667+
def test_second_zombie_pass_is_noop_after_one_kill(self):
2668+
"""A second zombie detection for the same (species, job_type) after one
2669+
real kill-and-resubmit cycle must be a no-op: no second delete, no second
2670+
resubmission, and the 'leaving for manual intervention' branch is taken."""
2671+
sched = self._make_sched()
2672+
job = self._stub_job()
2673+
self._install(sched, job)
2674+
run_calls = []
2675+
sched._run_a_job = lambda job, label: run_calls.append((job.job_name, label))
2676+
with patch('arc.job.zombie.output_mtime', return_value=None):
2677+
sched.check_for_zombie_jobs('H2')
2678+
self.assertTrue(job.deleted)
2679+
self.assertEqual(run_calls, [(job.job_name, 'H2')])
2680+
self.assertEqual(sched._zombie_kills['H2'], {'sp'})
2681+
# The resubmitted job wedges too: the queue reports it running again.
2682+
job.deleted = False
2683+
sched.running_jobs['H2'].append(job.job_name)
2684+
with self.assertLogs(logger='arc', level=logging.WARNING) as cm:
2685+
sched.check_for_zombie_jobs('H2')
2686+
self.assertEqual(run_calls, [(job.job_name, 'H2')])
2687+
self.assertFalse(job.deleted)
2688+
self.assertEqual(sched._zombie_kills['H2'], {'sp'})
2689+
self.assertIn(job.job_name, sched.running_jobs['H2'])
2690+
self.assertIn('manual intervention', '\n'.join(cm.output))
2691+
26662692
def test_cap_is_per_job_type(self):
26672693
sched = self._make_sched()
26682694
sched._zombie_kills['H2'] = {'sp'} # sp already used.

0 commit comments

Comments
 (0)