trsh: reduce Orca cpu cores on repeated DLPNO memory error

alongd · claude · alongd · commit 4582a7836edd · 2026-05-30T22:37:08.000+03:00
When increasing total memory had already been attempted, ARC kept
resubmitting a near-identical Orca job and looped endlessly. Now, when
a memory increase has already been tried, ARC also reduces the number
of cpu cores (raising the memory per core), and it escalates the
per-core target for the Orca 5.x "Insufficient job memory" path, which
reports no explicit requirement.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/arc/job/trsh.py b/arc/job/trsh.py
@@ -1066,27 +1066,38 @@ def trsh_ess_job(label: str,
             raise TrshError(f'DLPNO methods are incompatible with single-electron species {label} in Orca. '
                             f'This should have been caught by the Scheduler before job submission.')
         elif 'Memory' in job_status['keywords']:
-            # Increase memory allocation.
+            # Increase the memory per cpu core.
             # job_status will be for example
             # `Error  (ORCA_SCF): Not enough memory available! Please increase MaxCore to more than: 289 MB`.
+            # Whether ARC already attempted to increase the memory for this job. If so, simply requesting more
+            # total memory tends to keep failing (e.g. DLPNO-CCSD(T) triples need a large per-core MaxCore rather
+            # than more total node memory), so ARC reduces the number of cpu cores to raise the memory per core
+            # instead of resubmitting a near-identical job (which previously caused an endless retry loop).
+            memory_increased_before = 'memory' in ess_trsh_methods
             if 'memory' not in ess_trsh_methods:
                 ess_trsh_methods.append('memory')
+            per_cpu_core_memory = np.ceil(memory_gb / cpu_cores * 1024)  # MB currently allocated per cpu core
             try:
-                # parse Orca's memory requirement in MB
+                # parse Orca's explicit per cpu core memory requirement in MB (e.g., Orca 4.2.x)
                 estimated_mem_per_core = float(job_status['error'].split()[-2])
             except ValueError:
-                estimated_mem_per_core = estimate_orca_mem_cpu_requirement(num_heavy_atoms=num_heavy_atoms,
+                # Orca did not report an explicit requirement (e.g. Orca 5.x 'Insufficient job memory.').
+                # Aim for at least double the per-core memory already given so that reducing the number of cpu
+                # cores meaningfully raises the memory per core; never go below ARC's heuristic estimate.
+                heuristic_mem_per_core = estimate_orca_mem_cpu_requirement(num_heavy_atoms=num_heavy_atoms,
                                                                            server=server,
                                                                            consider_server_limits=True)[1]/cpu_cores
+                estimated_mem_per_core = max(2.0 * per_cpu_core_memory, heuristic_mem_per_core)
             # round up to the next hundred
             estimated_mem_per_core = int(np.ceil(estimated_mem_per_core / 100.0)) * 100
-            if 'max_total_job_memory' in job_status['keywords']:
-                per_cpu_core_memory = np.ceil(memory_gb / cpu_cores * 1024)
+            if 'max_total_job_memory' in job_status['keywords'] or memory_increased_before:
+                reason = 'the job had already requested the maximum amount of available total node memory' \
+                    if 'max_total_job_memory' in job_status['keywords'] \
+                    else 'increasing the total job memory had already been attempted and the job still ran out of memory'
                 logger.info(f'The crashed Orca job {label} was ran with {cpu_cores} cpu cores and '
                             f'{per_cpu_core_memory} MB memory per cpu core. It requires at least '
-                            f'{estimated_mem_per_core} MB per cpu core. Since the job had already requested the '
-                            f'maximum amount of available total node memory, ARC will attempt to reduce the number '
-                            f'of cpu cores to increase memory per cpu core.')
+                            f'{estimated_mem_per_core} MB per cpu core. Since {reason}, ARC will attempt to reduce '
+                            f'the number of cpu cores to increase memory per cpu core.')
                 if 'cpu' not in ess_trsh_methods:
                     ess_trsh_methods.append('cpu')
                 cpu_cores = math.floor(cpu_cores * per_cpu_core_memory / estimated_mem_per_core) - 2  # be conservative
diff --git a/arc/job/trsh_test.py b/arc/job/trsh_test.py
@@ -738,8 +738,11 @@ def test_trsh_ess_job(self):
 
         # Test Orca
         # Orca: test 1
-        # Test troubleshooting insufficient memory issue
-        # Automatically increase memory provided not exceeding maximum available memory
+        # Test troubleshooting insufficient memory issue.
+        # When merely increasing total memory has already been attempted ('memory' is already in
+        # ess_trsh_methods), simply requesting more total memory keeps failing (and previously caused
+        # ARC to resubmit near-identical jobs in an endless loop). Instead, ARC reduces the number of
+        # cpu cores to raise the memory per core (Orca's MaxCore).
         label = 'test'
         level_of_theory = {'method': 'dlpno-ccsd(T)'}
         server = 'server1'
@@ -759,8 +762,9 @@ def test_trsh_ess_job(self):
                                                                        job_type, software, fine, memory_gb,
                                                                        num_heavy_atoms, cpu_cores, ess_trsh_methods)
         self.assertIn('memory', ess_trsh_methods)
-        self.assertEqual(cpu_cores, 32)
-        self.assertAlmostEqual(memory, 327)
+        self.assertIn('cpu', ess_trsh_methods)
+        self.assertEqual(cpu_cores, 22)
+        self.assertAlmostEqual(memory, 227)
 
         # Orca: test 2
         # Test troubleshooting insufficient memory issue
@@ -814,6 +818,36 @@ def test_trsh_ess_job(self):
         self.assertEqual(couldnt_trsh, True)
         self.assertLess(cpu_cores, 1)  # can't really run job with less than 1 cpu ^o^
 
+        # Orca: test 3b
+        # Regression test for the Orca 5.x DLPNO-CCSD(T) "out of memory in the triples" loop.
+        # In Orca 5.x the message is "Please increase MaxCore - Skipping calculation" with no explicit
+        # per-core requirement, so determine_ess_status returns 'Insufficient job memory.'. Increasing
+        # total memory was already attempted (ess_trsh_methods=['memory']) and the node is NOT at its
+        # memory ceiling (no 'max_total_job_memory' keyword). Previously ARC kept resubmitting a nearly
+        # identical job forever; instead it must reduce the number of cpu cores so that the memory per
+        # core (Orca's MaxCore) actually increases.
+        label = 'test'
+        level_of_theory = {'method': 'dlpno-ccsd(T)'}
+        server = 'server2'
+        job_type = 'sp'
+        software = 'orca'
+        fine = False
+        memory_gb = 37
+        cpu_cores = 16
+        num_heavy_atoms = 16
+        ess_trsh_methods = ['memory']
+        job_status = {'keywords': ['MDCI', 'Memory'], 'error': 'Insufficient job memory.'}
+        mem_per_core_before = memory_gb / cpu_cores
+        output_errors, ess_trsh_methods, remove_checkfile, level_of_theory, software, job_type, fine, trsh_keyword, \
+            memory, shift, cpu_cores, couldnt_trsh = trsh.trsh_ess_job(label, level_of_theory, server, job_status,
+                                                                       job_type, software, fine, memory_gb,
+                                                                       num_heavy_atoms, cpu_cores, ess_trsh_methods)
+        self.assertIn('cpu', ess_trsh_methods)
+        self.assertFalse(couldnt_trsh)
+        self.assertEqual(cpu_cores, 5)  # cpu cores reduced (this breaks the endless retry loop)
+        self.assertAlmostEqual(memory, 29)
+        self.assertGreater(memory / cpu_cores, mem_per_core_before)  # memory per core increased
+
         # Orca: test 4
         # Test troubleshooting too many cpu cores
         # Automatically reduce cpu cores