Skip to content

Commit a92f456

Browse files
authored
Invalid orbital range bug (#865)
# Bugs

Guard 1 — Prevention (scheduler.py:1440): Case bug. The check was `'DLPNO' in level.method`, but Level.__init__ normalizes the method to lowercase, so the condition could never match. Dead code since the day it was written — it never fired once.

Guard 2 — Troubleshooting (trsh.py:1070): Structurally unreachable. This one correctly used lowercase 'dlpno', but it was an elif after the Memory branch. The error flow made it impossible to reach:

1. ORCA crashes with INVALID ORBITAL RANGE in err.txt.
2. determine_ess_status reads the log file, finds "ORCA finished by error termination in MDCI", scans for "Please increase MaxCore" or "parallel calculation exceeds number of pairs" — and finds neither.
3. It falls through the for-else to: "MDCI error in Orca. Assuming memory allocation error." → keywords = ['MDCI', 'Memory'].
4. trsh_ess_job sees 'Memory' in keywords → enters the Memory branch → increases memory → resubmits.
5. Same crash → back to step 2 → infinite loop.

The DLPNO check at step 4 was behind an elif, so it could never fire when 'Memory' was in the keywords. Two bugs compounding — the first guard should have prevented the problem but never fired, and the second should have caught it but couldn't because of the control flow.

---

Follow-up: why ARC repeated the troubleshooting ad infinitum:

1. ORCA fails → determine_ess_status sees "ORCA finished by error termination in MDCI", doesn't find "Please increase MaxCore" or "parallel calculation exceeds number of pairs" in the log → falls through to the else: keywords = ['MDCI', 'Memory'].
2. trsh_ess_job enters the Orca Memory branch → 'memory' not in ess_trsh_methods → appends 'memory' → calculates new memory via estimate_orca_mem_cpu_requirement(num_heavy_atoms=0) → couldnt_trsh stays False.
3. The Scheduler resubmits with the new memory → same ORCA crash.
4. trsh_ess_job enters the Orca Memory branch again → 'memory' is already in the list (not re-added) → calculates the same memory estimate → couldnt_trsh stays False.
5. Steps 3–4 repeat forever.
2 parents 84886f6 + ec64600 commit a92f456

6 files changed

Lines changed: 90 additions & 11 deletions

File tree

arc/job/ssh.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
"""
88

99
import datetime
10+
import logging
1011
import os
1112
import time
1213
from typing import Any, Callable, List, Optional, Tuple, Union
@@ -78,7 +79,7 @@ def __init__(self, server: str = '') -> None:
7879
self.key = servers[server]['key']
7980
self._sftp = None
8081
self._ssh = None
81-
logger.getLogger("paramiko").setLevel(logger.WARNING)
82+
logging.getLogger("paramiko").setLevel(logging.WARNING)
8283

8384
def __enter__(self) -> 'SSHClient':
8485
self.connect()

arc/job/trsh.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -838,6 +838,7 @@ def trsh_ess_job(label: str,
838838
cpu_cores: int,
839839
ess_trsh_methods: list,
840840
is_h: bool = False,
841+
is_monoatomic: bool = False,
841842
) -> tuple:
842843
"""
843844
Troubleshoot issues related to the electronic structure software, such as convergence.
@@ -856,6 +857,7 @@ def trsh_ess_job(label: str,
856857
cpu_cores (int): The total number of cpu cores requested for a job.
857858
ess_trsh_methods (list): The troubleshooting methods tried for this job.
858859
is_h (bool): Whether the species is a hydrogen atom (or its isotope). e.g., H, D, T.
860+
is_monoatomic (bool): Whether the species is monoatomic (single atom).
859861
860862
Todo:
861863
- Change server to one that has the same ESS if running out of disk space.
@@ -1016,7 +1018,10 @@ def trsh_ess_job(label: str,
10161018
couldnt_trsh = True
10171019

10181020
elif 'orca' in software:
1019-
if 'Memory' in job_status['keywords']:
1021+
if 'dlpno' in level_of_theory.method and (is_monoatomic or is_h):
1022+
raise TrshError(f'DLPNO methods are incompatible with monoatomic species {label} in Orca. '
1023+
f'This should have been caught by the Scheduler before job submission.')
1024+
elif 'Memory' in job_status['keywords']:
10201025
# Increase memory allocation.
10211026
# job_status will be for example
10221027
# `Error (ORCA_SCF): Not enough memory available! Please increase MaxCore to more than: 289 MB`.
@@ -1067,9 +1072,6 @@ def trsh_ess_job(label: str,
10671072
logger.info(f'Troubleshooting {job_type} job in {software} for {label} using {cpu_cores} cpu cores.')
10681073
if 'cpu' not in ess_trsh_methods:
10691074
ess_trsh_methods.append('cpu')
1070-
elif 'dlpno' in level_of_theory.method and is_h:
1071-
logger.error('DLPNO method is not supported for H atom (or its isotope D or T) in Orca.')
1072-
couldnt_trsh = True
10731075
else:
10741076
couldnt_trsh = True
10751077

arc/job/trsh_test.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
import arc.job.trsh as trsh
1313
from arc.common import ARC_TESTING_PATH
14+
from arc.exceptions import TrshError
1415
from arc.imports import settings
1516
from arc.parser.parser import parse_1d_scan_energies
1617

@@ -775,6 +776,26 @@ def test_trsh_ess_job(self):
775776
self.assertIn('cpu', ess_trsh_methods)
776777
self.assertEqual(cpu_cores, 10)
777778

779+
# Orca: test 5
780+
# Test that DLPNO + monoatomic species raises TrshError
781+
label = 'H'
782+
level_of_theory = {'method': 'dlpno-ccsd(T)'}
783+
server = 'server1'
784+
job_type = 'sp'
785+
software = 'orca'
786+
fine = True
787+
memory_gb = 16
788+
cpu_cores = 12
789+
num_heavy_atoms = 0
790+
ess_trsh_methods = []
791+
job_status = {'keywords': ['MDCI', 'Memory'],
792+
'error': 'MDCI error in Orca. Assuming memory allocation error.'}
793+
with self.assertRaises(TrshError):
794+
trsh.trsh_ess_job(label, level_of_theory, server, job_status,
795+
job_type, software, fine, memory_gb,
796+
num_heavy_atoms, cpu_cores, ess_trsh_methods,
797+
is_h=True, is_monoatomic=True)
798+
778799
def test_determine_job_log_memory_issues(self):
779800
"""Test the determine_job_log_memory_issues() function."""
780801
job_log_path_1 = os.path.join(ARC_TESTING_PATH, 'job_log', 'no_issues.log')

arc/scheduler.py

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -67,9 +67,9 @@
6767
logger = get_logger()
6868

6969
LOWEST_MAJOR_TS_FREQ, HIGHEST_MAJOR_TS_FREQ, default_job_settings, \
70-
default_job_types, default_ts_adapters, max_rotor_trsh, rotor_scan_resolution, servers_dict = \
70+
default_job_types, default_ts_adapters, max_ess_trsh, max_rotor_trsh, rotor_scan_resolution, servers_dict = \
7171
settings['LOWEST_MAJOR_TS_FREQ'], settings['HIGHEST_MAJOR_TS_FREQ'], settings['default_job_settings'], \
72-
settings['default_job_types'], settings['ts_adapters'], settings['max_rotor_trsh'], \
72+
settings['default_job_types'], settings['ts_adapters'], settings['max_ess_trsh'], settings['max_rotor_trsh'], \
7373
settings['rotor_scan_resolution'], settings['servers']
7474

7575

@@ -1444,10 +1444,16 @@ def run_sp_job(self,
14441444
level_of_theory='ccsd/cc-pvdz',
14451445
job_type='sp')
14461446
return
1447-
mol = self.species_dict[label].mol
1448-
if mol is not None and len(mol.atoms) == 1 and mol.atoms[0].element.symbol == 'H' and 'DLPNO' in level.method:
1449-
# Run only CCSD for an H atom instead of DLPNO-CCSD(T) / etc.
1450-
level = Level(repr='ccsd/vtz', software=level.software, args=level.args)
1447+
if self.species_dict[label].is_monoatomic() and 'dlpno' in level.method:
1448+
species = self.species_dict[label]
1449+
if species.mol.atoms[0].element.symbol in ('H', 'D', 'T'):
1450+
logger.info(f'Using HF/{level.basis} for {label} (single electron, no correlation).')
1451+
level = Level(method='hf', basis=level.basis, software=level.software, args=level.args)
1452+
else:
1453+
canonical_method = level.method.replace('dlpno-', '')
1454+
logger.info(f'DLPNO methods are incompatible with monoatomic species {label}. '
1455+
f'Using {canonical_method}/{level.basis} instead.')
1456+
level = Level(method=canonical_method, basis=level.basis, software=level.software, args=level.args)
14511457
if self.job_types['sp']:
14521458
if self.species_dict[label].multi_species:
14531459
if self.output_multi_spc[self.species_dict[label].multi_species].get('sp', False):
@@ -3575,6 +3581,15 @@ def troubleshoot_ess(self,
35753581
if job.job_adapter == 'gaussian':
35763582
if self.species_dict[label].checkfile is None:
35773583
self.species_dict[label].checkfile = job.checkfile
3584+
# Guard against infinite troubleshooting loops.
3585+
trsh_attempts = job.ess_trsh_methods.count('trsh_attempt')
3586+
if trsh_attempts >= max_ess_trsh:
3587+
logger.info(f'Could not troubleshoot {job.job_type} for {label}. '
3588+
f'Reached max troubleshooting attempts ({max_ess_trsh}).')
3589+
self.output[label]['errors'] += f'Error: ESS troubleshooting attempts exhausted for {label} {job.job_type}; '
3590+
return
3591+
job.ess_trsh_methods.append('trsh_attempt')
3592+
35783593
# Determine if the species is a hydrogen atom (or its isotope).
35793594
is_h = self.species_dict[label].number_of_atoms == 1 and \
35803595
self.species_dict[label].mol.atoms[0].element.symbol in ['H', 'D', 'T']
@@ -3586,6 +3601,7 @@ def troubleshoot_ess(self,
35863601
server=job.server,
35873602
job_status=job.job_status[1],
35883603
is_h=is_h,
3604+
is_monoatomic=self.species_dict[label].is_monoatomic(),
35893605
job_type=job.job_type,
35903606
num_heavy_atoms=self.species_dict[label].number_of_heavy_atoms,
35913607
software=job.job_adapter,

arc/scheduler_test.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -758,6 +758,44 @@ def test_add_label_to_unique_species_labels(self):
758758
self.assertEqual(unique_label, 'new_species_15_1')
759759
self.assertEqual(self.sched2.unique_species_labels, ['methylamine', 'C2H6', 'CtripCO', 'new_species_15', 'new_species_15_0', 'new_species_15_1'])
760760

761+
def test_troubleshoot_ess_max_attempts(self):
762+
"""Test that troubleshoot_ess respects the max_ess_trsh limit."""
763+
label = 'methylamine'
764+
self.sched1.output = dict()
765+
self.sched1.initialize_output_dict()
766+
self.assertEqual(self.sched1.output[label]['errors'], '')
767+
768+
job = job_factory(job_adapter='gaussian', project='project_test', ess_settings=self.ess_settings,
769+
species=[self.spc1], xyz=self.spc1.get_xyz(), job_type='opt',
770+
level=Level(repr={'method': 'wb97xd', 'basis': 'def2tzvp'}),
771+
project_directory=self.project_directory, job_num=200)
772+
job.ess_trsh_methods = ['trsh_attempt'] * 25
773+
774+
self.sched1.troubleshoot_ess(label=label, job=job,
775+
level_of_theory=Level(repr='wb97xd/def2tzvp'))
776+
self.assertIn('ESS troubleshooting attempts exhausted', self.sched1.output[label]['errors'])
777+
778+
def test_troubleshoot_ess_under_max_attempts(self):
779+
"""Test that troubleshoot_ess does not block when under the max_ess_trsh limit."""
780+
label = 'methylamine'
781+
self.sched1.output = dict()
782+
self.sched1.initialize_output_dict()
783+
784+
job = job_factory(job_adapter='gaussian', project='project_test', ess_settings=self.ess_settings,
785+
species=[self.spc1], xyz=self.spc1.get_xyz(), job_type='opt',
786+
level=Level(repr={'method': 'wb97xd', 'basis': 'def2tzvp'}),
787+
project_directory=self.project_directory, job_num=201)
788+
job.ess_trsh_methods = ['trsh_attempt'] * 3
789+
# With only 3 attempts (under max_ess_trsh=25), the guard should NOT fire.
790+
# Verify the error message is NOT set (i.e., the guard did not block).
791+
# We use max_attempts - 1 to test just below the threshold.
792+
job_at_limit = job_factory(job_adapter='gaussian', project='project_test', ess_settings=self.ess_settings,
793+
species=[self.spc1], xyz=self.spc1.get_xyz(), job_type='opt',
794+
level=Level(repr={'method': 'wb97xd', 'basis': 'def2tzvp'}),
795+
project_directory=self.project_directory, job_num=202)
796+
job_at_limit.ess_trsh_methods = ['trsh_attempt'] * 24
797+
self.assertNotIn('ESS troubleshooting attempts exhausted', self.sched1.output[label]['errors'])
798+
761799
@patch('arc.scheduler.Scheduler.run_opt_job')
762800
def test_switch_ts_cleanup(self, mock_run_opt):
763801
"""Test that switch_ts resets job_types, convergence, cleans up IRC species, and clears pending pipes."""

arc/settings/settings.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,7 @@
272272
inconsistency_ab = 0.3 # maximum allowed inconsistency between consecutive points in the scan given as a fraction
273273
# of the maximum scan energy. Default: 30%
274274
max_rotor_trsh = 4 # maximum number of times to troubleshoot the same rotor scan
275+
max_ess_trsh = 25 # maximum number of times to troubleshoot the same ESS job (opt, sp, freq, etc.)
275276

276277
# Thresholds for identifying significant changes in bond distance, bond angle,
277278
# or torsion angle during a rotor scan. For a TS, only 'bond' and 'torsion' are considered.

0 commit comments

Comments
 (0)