Skip to content

Commit 32a8ea1

Browse files
authored
Improved the xTB parser (#856)
2 parents 5c50f88 + 960197e commit 32a8ea1

4 files changed

Lines changed: 155 additions & 54 deletions

File tree

arc/constants.pxd

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
cdef double pi, Na, kB, R, h, hbar, c, e, m_e, m_p, m_n, amu, a0, E_h, F, E_h_kJmol, bohr_to_angstrom
1+
cdef double pi, Na, kB, R, h, hbar, c, e, m_e, m_p, m_n, amu, a0, bohr_to_angstrom, E_h, F, E_h_kJmol

arc/constants.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141

4242
#: The Bohr radius :math:`a_0` in :math:`\mathrm{m}`
4343
a0 = 5.2917721092e-11
44+
bohr_to_angstrom = a0 * 1e10
4445

4546
#: The atomic mass unit in :math:`\mathrm{kg}`
4647
amu = 1.660538921e-27
@@ -78,8 +79,6 @@
7879
#: Vacuum permittivity
7980
epsilon_0 = 8.8541878128
8081

81-
bohr_to_angstrom = 0.529177
82-
8382
# Cython does not automatically place module-level variables into the module
8483
# symbol table when in compiled mode, so we must do this manually so that we
8584
# can use the constants from both Cython and regular Python code

arc/parser/adapters/xtb.py

Lines changed: 66 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
import pandas as pd
1212

1313
from arc.common import is_str_float
14-
from arc.constants import E_h_kJmol
14+
from arc.constants import E_h_kJmol, bohr_to_angstrom
1515
from arc.species.converter import str_to_xyz, xyz_from_data, logger
1616
from arc.parser.adapter import ESSAdapter
1717
from arc.parser.factory import register_ess_adapter
@@ -35,13 +35,22 @@ def logfile_contains_errors(self) -> Optional[str]:
3535
Returns: Optional[str]
3636
``None`` if the log file is free of errors, otherwise the error is returned as a string.
3737
"""
38-
# Not implemented for xTB.
38+
with open(self.log_file_path, 'r') as f:
39+
for line in f:
40+
if 'abnormal termination' in line.lower():
41+
return line.strip()
42+
if '[ERROR]' in line or '#ERROR' in line:
43+
return line.strip()
3944
return None
4045

4146
def parse_geometry(self) -> Optional[Dict[str, tuple]]:
4247
"""
4348
Parse the xyz geometry from an ESS log file.
4449
50+
Supports both Turbomol ``$coord`` (Bohr) and Molfile V2000 (Angstrom) formats.
51+
If the file contains multiple geometry blocks (e.g., from multiple optimization
52+
cycles), only the last one is returned.
53+
4554
Returns: Optional[Dict[str, tuple]]
4655
The cartesian geometry.
4756
"""
@@ -58,6 +67,8 @@ def parse_geometry(self) -> Optional[Dict[str, tuple]]:
5867
final_structure = True
5968
continue
6069
if final_structure and '$coord' in line:
70+
# Reset on each new $coord block so we keep only the last one
71+
coords, symbols = list(), list()
6172
in_coord_block = True
6273
continue
6374
if final_structure and 'V2000' in line and not in_coord_block:
@@ -66,16 +77,18 @@ def parse_geometry(self) -> Optional[Dict[str, tuple]]:
6677
if parts and parts[0].isdigit():
6778
atom_count = int(parts[0])
6879
molfile_line_counter = 0
80+
# Reset on each new V2000 block
81+
coords, symbols = list(), list()
6982
continue
7083

71-
# Parse $coord format
84+
# Parse $coord format (Turbomole $coord coordinates are in Bohr, convert to Angstrom)
7285
if in_coord_block:
7386
if '$' in line or 'end' in line.lower() or len(line.split()) < 4:
7487
in_coord_block = False
7588
continue
7689
parts = line.split()
7790
try:
78-
x, y, z = map(float, parts[:3])
91+
x, y, z = (float(v) * bohr_to_angstrom for v in parts[:3])
7992
symbol = parts[3].capitalize() if len(parts[3]) == 1 else parts[3][0].upper() + parts[3][1:].lower()
8093
coords.append([x, y, z])
8194
symbols.append(symbol)
@@ -105,30 +118,34 @@ def parse_frequencies(self) -> Optional[np.ndarray]:
105118
"""
106119
Parse the frequencies from a freq job output file.
107120
121+
xTB prints frequencies twice (once after the Hessian and once in the
122+
Frequency Printout section). This method reads ALL eigval blocks and
123+
returns only the last complete one to ensure we get the final values.
124+
108125
Returns: Optional[np.ndarray]
109126
The parsed frequencies (in cm^-1).
110127
"""
111-
freqs = list()
128+
# Collect all eigval blocks; use the last one
129+
all_blocks = list()
130+
current_block = list()
112131
lines = _get_lines_from_file(self.log_file_path)
113-
read_output = False
114132

115133
for line in lines:
116-
if read_output:
117-
if 'eigval :' in line:
118-
splits = line.split()
119-
for split in splits[2:]:
120-
try:
121-
freq = float(split)
122-
if freq != 0.0:
123-
freqs.append(freq)
124-
except ValueError:
125-
continue
126-
elif line.strip() == "" or "projected vibrational frequencies" in line.lower():
127-
continue
128-
else:
129-
break
130-
if 'vibrational frequencies' in line.lower():
131-
read_output = True
134+
if 'eigval :' in line:
135+
splits = line.split()
136+
for split in splits[2:]:
137+
try:
138+
current_block.append(float(split))
139+
except ValueError:
140+
continue
141+
elif current_block:
142+
# End of an eigval run
143+
all_blocks.append(current_block)
144+
current_block = list()
145+
if current_block:
146+
all_blocks.append(current_block)
147+
148+
freqs = [f for f in all_blocks[-1] if f != 0.0] if all_blocks else list()
132149

133150
# Fallback: try vibspectrum file if no frequencies found in output
134151
if not freqs:
@@ -232,28 +249,32 @@ def parse_e_elect(self) -> Optional[float]:
232249
"""
233250
Parse the electronic energy from an sp job output file.
234251
252+
Looks for ``:: total energy ... Eh`` (SUMMARY block) or
253+
``| TOTAL ENERGY ... Eh |`` (TOTAL section).
254+
Avoids false matches against ``total energy gain`` (optimization deltas).
255+
235256
Returns: Optional[float]
236257
The electronic energy in kJ/mol.
237258
"""
259+
import re
238260
lines = _get_lines_from_file(self.log_file_path)
239261
energy = None
240-
for line in reversed(lines):
241-
if 'total energy' in line.lower():
242-
try:
243-
energy = float(line.split()[3].strip())
244-
break
245-
except (ValueError, IndexError):
262+
# Iterate forward and keep the LAST hit (final result)
263+
for line in lines:
264+
stripped = line.strip()
265+
if stripped.startswith(':: total energy') or 'TOTAL ENERGY' in line:
266+
m = re.search(r'(-?\d+\.\d+)\s+Eh', line)
267+
if m:
268+
energy = float(m.group(1))
269+
if energy is None:
270+
# Fallback: 'final energy' lines (rare)
271+
for line in reversed(lines):
272+
if 'final energy' in line.lower():
246273
try:
247-
energy = float(line.split()[-1].strip())
274+
energy = float(line.split()[-1])
248275
break
249276
except (ValueError, IndexError):
250277
continue
251-
if 'final energy' in line.lower():
252-
try:
253-
energy = float(line.split()[-1])
254-
break
255-
except (ValueError, IndexError):
256-
continue
257278
if energy is not None:
258279
return energy * E_h_kJmol
259280
return None
@@ -265,16 +286,16 @@ def parse_zpe_correction(self) -> Optional[float]:
265286
Returns: Optional[float]
266287
The calculated zero point energy in kJ/mol.
267288
"""
289+
import re
268290
zpe = None
269291
with open(self.log_file_path, 'r') as f:
270292
for line in f:
271293
if 'zero-point vibrational energy' in line.lower() or 'zero point energy' in line.lower():
272294
# :: zero point energy 0.056690417480 Eh ::
273-
try:
274-
zpe = float(line.split()[-3])
295+
m = re.search(r'(\d+\.\d+(?:[eE][+-]?\d+)?)\s+Eh', line)
296+
if m:
297+
zpe = float(m.group(1))
275298
break
276-
except (ValueError, IndexError):
277-
continue
278299
if zpe is not None:
279300
return zpe * E_h_kJmol
280301
return None
@@ -323,8 +344,12 @@ def parse_1d_scan_energies(self) -> Tuple[Optional[List[float]], Optional[List[f
323344
logger.warning(f'No valid scan points found in xTB scan log file {scan_path}.')
324345
return None, None
325346

326-
# Angles: evenly spaced 0 to 360 inclusive
327-
angles = [i * 360.0 / (n_points + 1) for i in range(n_points + 1)]
347+
# Angles: evenly spaced from 0 deg with one angle per energy point.
348+
# For 44 energies, the dihedral was scanned in steps of 360/45 = 8 deg.
349+
# We return n_points angles (matching the energies length): [0, 8, 16, ..., 344].
350+
# Note: callers expecting (n_points+1) angles for n_points energies should
351+
# add the closing 360 deg themselves.
352+
angles = [i * 360.0 / (n_points + 1) for i in range(n_points)]
328353

329354
return rel_energies, angles
330355

arc/parser/parser_test.py

Lines changed: 87 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -480,18 +480,18 @@ def test_parse_geometry_2(self):
480480
self.assertIsInstance(xyz_1, dict)
481481
self.assertEqual(len(xyz_1['symbols']), 53)
482482

483-
path_2 = os.path.join(ARC_TESTING_PATH, 'opt', 'xtb_opt_1.out') # Turbomol format
483+
path_2 = os.path.join(ARC_TESTING_PATH, 'opt', 'xtb_opt_1.out') # Turbomol format ($coord is in Bohr)
484484
xyz_2 = parser.parse_geometry(log_file_path=path_2)
485485
expected_xyz_2 = {'symbols': ('C', 'C', 'C', 'O', 'H', 'H', 'H', 'H'),
486486
'isotopes': (12, 12, 12, 16, 1, 1, 1, 1),
487-
'coords': ((-2.1908361683609, -0.0875055545944093, -0.712358508847116),
488-
(-0.170663821576948, -0.721009080780027, 0.6339514359292),
489-
(2.27150340995404, 0.539200343733434, 0.305747355748416),
490-
(4.16185183882216, 0.0456870795397582, 1.46958543963615),
491-
(-2.10906550336131, 1.39281538085596, -2.11159862590894),
492-
(-3.98822887666127, -1.01150815322938, -0.474092464462499),
493-
(-0.230883305078883, -2.20128348308961, 2.04098984698359),
494-
(2.25632242626311, 2.04360346756428, -1.15222446018155))}
487+
'coords': ((-1.1593401110647, -0.0463059268636, -0.3769637386362),
488+
(-0.0903113691106, -0.3815414223399, 0.3354725190107),
489+
(1.2020273599692, 0.2853324202958, 0.1617944684729),
490+
(2.2023562705124, 0.0241765516896, 0.7776708141903),
491+
(-1.1160689558722, 0.7370458647952, -1.1174094260626),
492+
(-2.110478992265, -0.5352668500015, -0.2508788280669),
493+
(-0.1221781347317, -1.1648685897309, 1.0800448842572),
494+
(1.1939939325626, 1.0814279521553, -0.6097306831655))}
495495
self.assertTrue(almost_equal_coords(xyz_2, expected_xyz_2))
496496

497497
path_3 = os.path.join(ARC_TESTING_PATH, 'opt', 'xtb_opt_2.out') # SDF format
@@ -732,10 +732,12 @@ def test_parse_1d_scan_energies(self):
732732
8.397974544321187, 7.071194090611243, 5.214457982623571, 3.4362612986915337,
733733
1.958199294316728, 0.8766536693692615, 0.22504856466548517,
734734
0.0004629911018128041])
735+
# Length-matched: 44 energies -> 44 angles (step = 360/(n+1) = 8 deg)
736+
self.assertEqual(len(angles_3), len(energies_3))
735737
self.assertEqual(angles_3, [0.0, 8.0, 16.0, 24.0, 32.0, 40.0, 48.0, 56.0, 64.0, 72.0, 80.0, 88.0, 96.0,
736738
104.0, 112.0, 120.0, 128.0, 136.0, 144.0, 152.0, 160.0, 168.0, 176.0, 184.0, 192.0,
737739
200.0, 208.0, 216.0, 224.0, 232.0, 240.0, 248.0, 256.0, 264.0, 272.0, 280.0, 288.0,
738-
296.0, 304.0, 312.0, 320.0, 328.0, 336.0, 344.0, 352.0])
740+
296.0, 304.0, 312.0, 320.0, 328.0, 336.0, 344.0])
739741

740742
path_4 = os.path.join(ARC_TESTING_PATH, 'rotor_scans', 'orca', 'cc.txt')
741743
energies_4, angles_4 = parser.parse_1d_scan_energies(log_file_path=path_4)
@@ -1000,6 +1002,81 @@ def test_parse_active_space(self):
10001002
xyz="""N 0.0 0.0 0.0"""))
10011003
self.assertEqual(active, {'e_o': (5, 4), 'occ': [3, 1, 1, 0, 1, 0, 0, 0], 'closed': [1, 0, 0, 0, 0, 0, 0, 0]})
10021004

1005+
# ----- xTB-specific improvements -----
1006+
1007+
def test_xtb_logfile_contains_errors(self):
1008+
"""xTB error detection should return None for clean files."""
1009+
from arc.parser.adapters.xtb import XTBParser
1010+
clean_path = os.path.join(ARC_TESTING_PATH, 'sp', 'NCC_xTB.out')
1011+
adapter = XTBParser(log_file_path=clean_path)
1012+
self.assertIsNone(adapter.logfile_contains_errors())
1013+
1014+
def test_xtb_parse_e_elect_avoids_false_match(self):
1015+
"""parse_e_elect should not match 'total energy gain' lines from optimization deltas.
1016+
1017+
The CO2_xtb.out file contains both 'total energy gain : -0.1724555 Eh' (delta)
1018+
and ':: total energy -10.308452243026 Eh' (final). The parser must return the
1019+
final value, not the delta.
1020+
"""
1021+
from arc.constants import E_h_kJmol
1022+
path = os.path.join(ARC_TESTING_PATH, 'freq', 'CO2_xtb.out')
1023+
e_elect = parser.parse_e_elect(path)
1024+
self.assertAlmostEqual(e_elect, -10.308452243026 * E_h_kJmol, places=3)
1025+
# Make sure we did NOT pick up the delta value
1026+
delta_value = -0.1724555 * E_h_kJmol # ~-452 kJ/mol
1027+
self.assertNotAlmostEqual(e_elect, delta_value, places=1)
1028+
1029+
def test_xtb_parse_frequencies_takes_last_block(self):
1030+
"""parse_frequencies should return the LAST eigval block.
1031+
1032+
TS_NH2+N2H3_xtb.out prints frequencies twice. Both blocks should be identical;
1033+
if the parser stops after the first block, it would still get the right values,
1034+
but using the last block is more robust against truncated/duplicate blocks.
1035+
"""
1036+
path = os.path.join(ARC_TESTING_PATH, 'freq', 'TS_NH2+N2H3_xtb.out')
1037+
freqs = parser.parse_frequencies(log_file_path=path)
1038+
self.assertIsNotNone(freqs)
1039+
# Imaginary frequency should be the first non-zero one
1040+
self.assertAlmostEqual(freqs[0], -781.89, places=1)
1041+
# Last frequency
1042+
self.assertAlmostEqual(freqs[-1], 3467.59, places=1)
1043+
# 24 modes total (3*8 - 6 = 18 + 6 zero = 24, but only 18 non-zero in TS)
1044+
self.assertEqual(len(freqs), 18)
1045+
1046+
def test_xtb_parse_zpe_correction_regex(self):
1047+
"""parse_zpe_correction uses regex to find ZPE in scientific notation safely."""
1048+
from arc.constants import E_h_kJmol
1049+
path = os.path.join(ARC_TESTING_PATH, 'freq', 'TS_NH2+N2H3_xtb.out')
1050+
zpe = parser.parse_zpe_correction(log_file_path=path)
1051+
self.assertIsNotNone(zpe)
1052+
self.assertAlmostEqual(zpe, 0.056690417480 * E_h_kJmol, places=3)
1053+
1054+
def test_xtb_parse_1d_scan_energies_length_match(self):
1055+
"""Energies and angles returned by parse_1d_scan_energies must have matching lengths."""
1056+
path = os.path.join(ARC_TESTING_PATH, 'rotor_scans', 'xtb_1', 'output.out')
1057+
energies, angles = parser.parse_1d_scan_energies(log_file_path=path)
1058+
self.assertIsNotNone(energies)
1059+
self.assertIsNotNone(angles)
1060+
self.assertEqual(len(energies), len(angles))
1061+
# First angle = 0, all angles strictly increasing
1062+
self.assertEqual(angles[0], 0.0)
1063+
for i in range(1, len(angles)):
1064+
self.assertGreater(angles[i], angles[i - 1])
1065+
# All angles should be in [0, 360)
1066+
self.assertLess(angles[-1], 360.0)
1067+
1068+
def test_xtb_parse_geometry_resets_on_new_block(self):
1069+
"""parse_geometry should not accumulate atoms across multiple geometry blocks.
1070+
1071+
For CO2_xtb.out (Turbomol $coord with 3 atoms), the parser should return
1072+
exactly 3 atoms, not 6 or more even if the file mentions multiple blocks.
1073+
"""
1074+
path = os.path.join(ARC_TESTING_PATH, 'freq', 'CO2_xtb.out')
1075+
xyz = parser.parse_geometry(log_file_path=path)
1076+
self.assertIsNotNone(xyz)
1077+
self.assertEqual(len(xyz['symbols']), 3)
1078+
self.assertEqual(xyz['symbols'], ('O', 'C', 'O'))
1079+
10031080

10041081
def test_parse_opt_steps(self):
10051082
"""Test parsing the number of optimization steps from various ESS output files."""

0 commit comments

Comments
 (0)