Skip to content

Commit 4582a78

Browse files
alongd and claude committed
trsh: reduce Orca cpu cores on repeated DLPNO memory error
When increasing total memory had already been attempted, ARC kept resubmitting a near-identical Orca job and looped. Also reduce cpu cores (raising memory per core) when memory was already tried, and escalate the per-core target for the Orca 5.x "Insufficient job memory" path that reports no explicit requirement. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 57b655b commit 4582a78

2 files changed

Lines changed: 57 additions & 12 deletions

File tree

arc/job/trsh.py

Lines changed: 19 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -1066,27 +1066,38 @@ def trsh_ess_job(label: str,
10661066
raise TrshError(f'DLPNO methods are incompatible with single-electron species {label} in Orca. '
10671067
f'This should have been caught by the Scheduler before job submission.')
10681068
elif 'Memory' in job_status['keywords']:
1069-
# Increase memory allocation.
1069+
# Increase the memory per cpu core.
10701070
# job_status will be for example
10711071
# `Error (ORCA_SCF): Not enough memory available! Please increase MaxCore to more than: 289 MB`.
1072+
# Whether ARC already attempted to increase the memory for this job. If so, simply requesting more
1073+
# total memory tends to keep failing (e.g. DLPNO-CCSD(T) triples need a large per-core MaxCore rather
1074+
# than more total node memory), so ARC reduces the number of cpu cores to raise the memory per core
1075+
# instead of resubmitting a near-identical job (which previously caused an endless retry loop).
1076+
memory_increased_before = 'memory' in ess_trsh_methods
10721077
if 'memory' not in ess_trsh_methods:
10731078
ess_trsh_methods.append('memory')
1079+
per_cpu_core_memory = np.ceil(memory_gb / cpu_cores * 1024) # MB currently allocated per cpu core
10741080
try:
1075-
# parse Orca's memory requirement in MB
1081+
# parse Orca's explicit per cpu core memory requirement in MB (e.g., Orca 4.2.x)
10761082
estimated_mem_per_core = float(job_status['error'].split()[-2])
10771083
except ValueError:
1078-
estimated_mem_per_core = estimate_orca_mem_cpu_requirement(num_heavy_atoms=num_heavy_atoms,
1084+
# Orca did not report an explicit requirement (e.g. Orca 5.x 'Insufficient job memory.').
1085+
# Aim for at least double the per-core memory already given so that reducing the number of cpu
1086+
# cores meaningfully raises the memory per core; never go below ARC's heuristic estimate.
1087+
heuristic_mem_per_core = estimate_orca_mem_cpu_requirement(num_heavy_atoms=num_heavy_atoms,
10791088
server=server,
10801089
consider_server_limits=True)[1]/cpu_cores
1090+
estimated_mem_per_core = max(2.0 * per_cpu_core_memory, heuristic_mem_per_core)
10811091
# round up to the next hundred
10821092
estimated_mem_per_core = int(np.ceil(estimated_mem_per_core / 100.0)) * 100
1083-
if 'max_total_job_memory' in job_status['keywords']:
1084-
per_cpu_core_memory = np.ceil(memory_gb / cpu_cores * 1024)
1093+
if 'max_total_job_memory' in job_status['keywords'] or memory_increased_before:
1094+
reason = 'the job had already requested the maximum amount of available total node memory' \
1095+
if 'max_total_job_memory' in job_status['keywords'] \
1096+
else 'increasing the total job memory had already been attempted and the job still ran out of memory'
10851097
logger.info(f'The crashed Orca job {label} was ran with {cpu_cores} cpu cores and '
10861098
f'{per_cpu_core_memory} MB memory per cpu core. It requires at least '
1087-
f'{estimated_mem_per_core} MB per cpu core. Since the job had already requested the '
1088-
f'maximum amount of available total node memory, ARC will attempt to reduce the number '
1089-
f'of cpu cores to increase memory per cpu core.')
1099+
f'{estimated_mem_per_core} MB per cpu core. Since {reason}, ARC will attempt to reduce '
1100+
f'the number of cpu cores to increase memory per cpu core.')
10901101
if 'cpu' not in ess_trsh_methods:
10911102
ess_trsh_methods.append('cpu')
10921103
cpu_cores = math.floor(cpu_cores * per_cpu_core_memory / estimated_mem_per_core) - 2 # be conservative

arc/job/trsh_test.py

Lines changed: 38 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -738,8 +738,11 @@ def test_trsh_ess_job(self):
738738

739739
# Test Orca
740740
# Orca: test 1
741-
# Test troubleshooting insufficient memory issue
742-
# Automatically increase memory provided not exceeding maximum available memory
741+
# Test troubleshooting insufficient memory issue.
742+
# When merely increasing total memory has already been attempted ('memory' is already in
743+
# ess_trsh_methods), simply requesting more total memory keeps failing (and previously caused
744+
# ARC to resubmit near-identical jobs in an endless loop). Instead, ARC reduces the number of
745+
# cpu cores to raise the memory per core (Orca's MaxCore).
743746
label = 'test'
744747
level_of_theory = {'method': 'dlpno-ccsd(T)'}
745748
server = 'server1'
@@ -759,8 +762,9 @@ def test_trsh_ess_job(self):
759762
job_type, software, fine, memory_gb,
760763
num_heavy_atoms, cpu_cores, ess_trsh_methods)
761764
self.assertIn('memory', ess_trsh_methods)
762-
self.assertEqual(cpu_cores, 32)
763-
self.assertAlmostEqual(memory, 327)
765+
self.assertIn('cpu', ess_trsh_methods)
766+
self.assertEqual(cpu_cores, 22)
767+
self.assertAlmostEqual(memory, 227)
764768

765769
# Orca: test 2
766770
# Test troubleshooting insufficient memory issue
@@ -814,6 +818,36 @@ def test_trsh_ess_job(self):
814818
self.assertEqual(couldnt_trsh, True)
815819
self.assertLess(cpu_cores, 1) # can't really run job with less than 1 cpu ^o^
816820

821+
# Orca: test 3b
822+
# Regression test for the Orca 5.x DLPNO-CCSD(T) "out of memory in the triples" loop.
823+
# In Orca 5.x the message is "Please increase MaxCore - Skipping calculation" with no explicit
824+
# per-core requirement, so determine_ess_status returns 'Insufficient job memory.'. Increasing
825+
# total memory was already attempted (ess_trsh_methods=['memory']) and the node is NOT at its
826+
# memory ceiling (no 'max_total_job_memory' keyword). Previously ARC kept resubmitting a nearly
827+
# identical job forever; instead it must reduce the number of cpu cores so that the memory per
828+
# core (Orca's MaxCore) actually increases.
829+
label = 'test'
830+
level_of_theory = {'method': 'dlpno-ccsd(T)'}
831+
server = 'server2'
832+
job_type = 'sp'
833+
software = 'orca'
834+
fine = False
835+
memory_gb = 37
836+
cpu_cores = 16
837+
num_heavy_atoms = 16
838+
ess_trsh_methods = ['memory']
839+
job_status = {'keywords': ['MDCI', 'Memory'], 'error': 'Insufficient job memory.'}
840+
mem_per_core_before = memory_gb / cpu_cores
841+
output_errors, ess_trsh_methods, remove_checkfile, level_of_theory, software, job_type, fine, trsh_keyword, \
842+
memory, shift, cpu_cores, couldnt_trsh = trsh.trsh_ess_job(label, level_of_theory, server, job_status,
843+
job_type, software, fine, memory_gb,
844+
num_heavy_atoms, cpu_cores, ess_trsh_methods)
845+
self.assertIn('cpu', ess_trsh_methods)
846+
self.assertFalse(couldnt_trsh)
847+
self.assertEqual(cpu_cores, 5) # cpu cores reduced (this breaks the endless retry loop)
848+
self.assertAlmostEqual(memory, 29)
849+
self.assertGreater(memory / cpu_cores, mem_per_core_before) # memory per core increased
850+
817851
# Orca: test 4
818852
# Test troubleshooting too many cpu cores
819853
# Automatically reduce cpu cores

0 commit comments

Comments
 (0)