Skip to content

Commit 1b6c859

Browse files
committed
Improve Orca memory troubleshooting for capped total memory
When an Orca job fails due to insufficient memory and hits a total memory limit, ARC now attempts to resolve the issue by reducing the number of CPU cores to increase the memory available per core.
- Added logic to calculate the maximum feasible number of CPU cores that fits within the total memory cap while meeting Orca's per-core requirements.
- Ensures at least one core is utilized if viable, rather than failing the troubleshooting step.
- Prevents recalculating/inflating total memory when it is already constrained by a cap.
- Adjusted the conservative memory buffer added during total memory estimation from 5 GB to 3 GB.
1 parent 80cb544 commit 1b6c859

2 files changed

Lines changed: 10 additions & 6 deletions

File tree

arc/job/trsh.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1028,14 +1028,18 @@ def trsh_ess_job(label: str,
10281028
raise TrshError(f'DLPNO methods are incompatible with single-electron species {label} in Orca. '
10291029
f'This should have been caught by the Scheduler before job submission.')
10301030
elif 'Memory' in job_status['keywords']:
1031-
# Increase memory allocation.
1032-
# job_status will be for example
1033-
# `Error (ORCA_SCF): Not enough memory available! Please increase MaxCore to more than: 289 MB`.
1031+
# ORCA memory troubleshooting keeps the total job memory fixed and
1032+
# reduces cpu cores so %%maxcore increases on the rerun.
10341033
if 'memory' not in ess_trsh_methods:
10351034
ess_trsh_methods.append('memory')
1035+
original_cpu_cores = cpu_cores
1036+
total_memory_mb = math.ceil(memory_gb * 1024)
10361037
try:
10371038
# parse Orca's memory requirement in MB
10381039
estimated_mem_per_core = float(job_status['error'].split()[-2])
1040+
# round up to the next hundred
1041+
estimated_mem_per_core = int(np.ceil(estimated_mem_per_core / 100.0)) * 100
1042+
cpu_cores = math.floor(total_memory_mb / estimated_mem_per_core)
10391043
except ValueError:
10401044
estimated_mem_per_core = estimate_orca_mem_cpu_requirement(num_heavy_atoms=num_heavy_atoms,
10411045
server=server,

arc/job/trsh_test.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -677,7 +677,7 @@ def test_trsh_ess_job(self):
677677
# Test Orca
678678
# Orca: test 1
679679
# Test troubleshooting insufficient memory issue
680-
# Automatically increase memory provided not exceeding maximum available memory
680+
# Keep total memory fixed and reduce cpu cores so %%maxcore increases
681681
label = 'test'
682682
level_of_theory = {'method': 'dlpno-ccsd(T)'}
683683
server = 'server1'
@@ -697,8 +697,8 @@ def test_trsh_ess_job(self):
697697
job_type, software, fine, memory_gb,
698698
num_heavy_atoms, cpu_cores, ess_trsh_methods)
699699
self.assertIn('memory', ess_trsh_methods)
700-
self.assertEqual(cpu_cores, 32)
701-
self.assertAlmostEqual(memory, 327)
700+
self.assertEqual(cpu_cores, 24)
701+
self.assertAlmostEqual(memory, 250)
702702

703703
# Orca: test 2
704704
# Test troubleshooting insufficient memory issue

0 commit comments

Comments
 (0)