|
22 | 22 | from arc.common import ARC_PATH, ARC_TESTING_PATH, almost_equal_coords_lists, initialize_job_types, read_yaml_file |
23 | 23 | from arc.job.adapters.common import default_incore_adapters, ts_adapters_by_rmg_family, ts_adapters_for_unknown_unimolecular |
24 | 24 | from arc.job.factory import job_factory |
| 25 | +from arc.job.zombie import ZOMBIE_GRACE_SECONDS |
25 | 26 | from arc.level import Level |
26 | 27 | from arc.level.protocol import CompositeProtocol |
27 | 28 | from arc.plotter import save_conformers_file |
@@ -2567,7 +2568,7 @@ def _make_sched(self): |
2567 | 2568 | return sched |
2568 | 2569 |
|
2569 | 2570 | def _stub_job(self, job_adapter='molpro', job_type='sp', execution_type='queue', |
2570 | | - initial_offset_seconds=7200, job_name='sp_a3177', job_id=12345): |
| 2571 | + initial_offset_seconds=ZOMBIE_GRACE_SECONDS + 3600, job_name='sp_a3177', job_id=12345): |
2571 | 2572 | job = SimpleNamespace( |
2572 | 2573 | job_name=job_name, job_type=job_type, job_id=job_id, |
2573 | 2574 | job_adapter=job_adapter, execution_type=execution_type, |
@@ -2615,7 +2616,7 @@ def test_healthy_job_not_killed(self): |
2615 | 2616 |
|
2616 | 2617 | def test_grace_period_blocks_zombie_check(self): |
2617 | 2618 | sched = self._make_sched() |
2618 | | - # Spawned 30 minutes ago — within the 1hr grace period. |
| 2619 | + # Spawned 30 minutes ago — within the grace period. |
2619 | 2620 | job = self._stub_job(initial_offset_seconds=1800) |
2620 | 2621 | self._install(sched, job) |
2621 | 2622 | sched._run_a_job = lambda *a, **kw: self.fail("must not act inside grace window") |
@@ -2663,6 +2664,31 @@ def test_cap_prevents_double_resubmit(self): |
2663 | 2664 | self.assertFalse(job.deleted) |
2664 | 2665 | self.assertIn(job.job_name, sched.running_jobs['H2']) |
2665 | 2666 |
|
| 2667 | + def test_second_zombie_pass_is_noop_after_one_kill(self): |
| 2668 | + """Check that a second zombie pass for the same (species, job_type), after one |
| 2669 | + kill-and-resubmit cycle, is a no-op: no second delete, no second resubmission, |
| 2670 | + the kill record and running-jobs entry stay put, and 'manual intervention' is logged.""" |
| 2671 | + sched = self._make_sched() |
| 2672 | + job = self._stub_job() |
| 2673 | + self._install(sched, job) |
| 2674 | + run_calls = []  # collects one (job_name, label) tuple per call to the stubbed _run_a_job |
| 2675 | + sched._run_a_job = lambda job, label: run_calls.append((job.job_name, label)) |
| 2676 | + with patch('arc.job.zombie.output_mtime', return_value=None):  # output_mtime is stubbed to return None |
| 2677 | + sched.check_for_zombie_jobs('H2') |
| 2678 | + self.assertTrue(job.deleted)  # first pass: the job was deleted... |
| 2679 | + self.assertEqual(run_calls, [(job.job_name, 'H2')])  # ...resubmitted exactly once... |
| 2680 | + self.assertEqual(sched._zombie_kills['H2'], {'sp'})  # ...and 'sp' was recorded as killed for H2 |
| 2681 | + # Simulate the resubmitted job wedging too: clear the kill flag and re-register it as running. |
| 2682 | + job.deleted = False |
| 2683 | + sched.running_jobs['H2'].append(job.job_name) |
| 2684 | + with self.assertLogs(logger='arc', level=logging.WARNING) as cm:  # capture 'arc' logs at WARNING and above |
| 2685 | + sched.check_for_zombie_jobs('H2') |
| 2686 | + self.assertEqual(run_calls, [(job.job_name, 'H2')])  # second pass: still only the one resubmission |
| 2687 | + self.assertFalse(job.deleted)  # no second delete |
| 2688 | + self.assertEqual(sched._zombie_kills['H2'], {'sp'})  # kill record unchanged |
| 2689 | + self.assertIn(job.job_name, sched.running_jobs['H2'])  # job is left in the running list |
| 2690 | + self.assertIn('manual intervention', '\n'.join(cm.output))  # the captured log mentions manual intervention |
| 2691 | + |
2666 | 2692 | def test_cap_is_per_job_type(self): |
2667 | 2693 | sched = self._make_sched() |
2668 | 2694 | sched._zombie_kills['H2'] = {'sp'} # sp already used. |
|
0 commit comments