Skip to content

Commit 11826f6

Browse files
committed
refactor thermo extract_source_from_comments
This refactors the extract_source_from_comments function and changes library sources so that a tuple of the library name and the library entry object is returned instead of just the library name. It also returns a tuple of QM method and molecule used for the case of QM sources.
1 parent acf84f6 commit 11826f6

2 files changed

Lines changed: 222 additions & 162 deletions

File tree

rmgpy/data/thermo.py

Lines changed: 184 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -1643,7 +1643,7 @@ def correct_binding_energy(self, thermo, species, metal_to_scale_from=None, meta
16431643
thermo.comment += f" Binding energy corrected by LSR ({'+'.join(comments)}) from {metal_to_scale_from} (H={change_in_binding_energy/1e3:+.0f}kJ/mol)"
16441644
return thermo
16451645

1646-
def get_thermo_data_for_surface_species(self, species):
1646+
def get_thermo_data_for_surface_species(self, species, return_resonance_data=False):
16471647
"""
16481648
Get the thermo data for an adsorbed species,
16491649
by desorbing it, finding the thermo of the gas-phase
@@ -1652,6 +1652,9 @@ def get_thermo_data_for_surface_species(self, species):
16521652
Does not apply linear scaling relationship.
16531653
16541654
Returns a :class:`ThermoData` object, with no Cp0 or CpInf
1655+
1656+
option to return resonance data if return_resonance_data is True,
1657+
which is useful for identifying the specific molecule chosen
16551658
"""
16561659

16571660
# define the comparison function to find the lowest energy
@@ -1741,7 +1744,7 @@ def species_enthalpy(species):
17411744
atom.label = label
17421745

17431746
# a tuple of molecule and its thermo
1744-
resonance_data.append((molecule, thermo))
1747+
resonance_data.append((molecule, thermo, gas_phase_species))
17451748

17461749
# Get the enthalpy of formation of every adsorbate at 298 K and
17471750
# determine the resonance structure with the lowest enthalpy of formation.
@@ -1751,7 +1754,7 @@ def species_enthalpy(species):
17511754
resonance_data = sorted(resonance_data, key=lambda x: x[1].H298.value_si)
17521755

17531756
# reorder the resonance structures (molecules) by their H298
1754-
species.molecule = [mol for mol, thermo in resonance_data]
1757+
species.molecule = [mol for mol, thermo, gas_phase_species in resonance_data]
17551758

17561759
thermo = resonance_data[0][1]
17571760

@@ -1760,6 +1763,9 @@ def species_enthalpy(species):
17601763

17611764
find_cp0_and_cpinf(species, thermo)
17621765

1766+
if return_resonance_data is True: # is True makes this a little more error-proof in case user accidentally provides another argument that evaluates to True
1767+
return thermo, resonance_data
1768+
17631769
return thermo
17641770

17651771
def _add_adsorption_correction(self, adsorption_thermo, adsorption_groups, molecule, surface_sites):
@@ -2854,82 +2860,9 @@ def get_ring_groups_from_comments(self, thermo_data):
28542860

28552861
return ring_groups, polycyclic_groups
28562862

2857-
def extract_source_from_comments(self, species):
2858-
"""
2859-
`species`: A species object containing thermo data and thermo data comments
2860-
2861-
Parses the verbose string of comments from the thermo data of the species object,
2862-
and extracts the thermo sources.
2863-
2864-
Returns a dictionary with keys of 'Library', 'QM', 'ADS', and/or 'GAV'.
2865-
Commonly, species thermo are estimated using only one of these sources.
2866-
However, a radical can be estimated with more than one type of source, for
2867-
instance a saturated library value and a GAV HBI correction, or a QM saturated value
2868-
and a GAV HBI correction. Adsorbates can be estimated using a single library
2869-
for the adsorbate or a combination of a gas phase library for the
2870-
gas phase portion and an adsorption correction.
2871-
2872-
source = {'Library': String_Name_of_Library_Used,
2873-
'QM': String_of_Method_Used,
2874-
'GAV': Dictionary_of_Groups_Used,
2875-
'ADS': Dictionary_of_Adsorption_Group_Used,
2876-
}
2877-
2878-
The Dictionary_of_Groups_Used looks like
2879-
{'groupType':[List of tuples containing (Entry, Weight)]
2880-
"""
2881-
comment = species.thermo.comment
2882-
tokens = comment.split()
2883-
2884-
source = {}
2885-
2886-
if comment.startswith('Thermo library'):
2887-
# Store name of the library source, which is the 3rd token in the comments
2888-
source['Library'] = tokens[2]
2889-
2890-
elif comment.startswith('QM'):
2891-
# Store the level of the calculation, which is the 2nd token in the comments
2892-
source['QM'] = tokens[1]
2893-
2894-
elif comment.startswith('Gas phase thermo'):
2895-
# Handle adsorption correction thermo data of the following format:
2896-
# Library example
2897-
# Gas phase thermo for C(T) from Thermo library: primaryThermoLibrary.
2898-
# Adsorption correction: + Thermo group additivity estimation: adsorptionPt111(Cq*)
2899-
2900-
# GAV example
2901-
# Gas phase thermo for [CH]CC from Thermo group additivity estimation: group(Cs-CsCsHH) + group(Cs-CsHHH) + group(Cs-CsHHH) + radical(CCJ2_triplet).
2902-
# Adsorption correction: + Thermo group additivity estimation: adsorptionPt111(C=*RCR3)"
2903-
2904-
comment = comment.replace(r'\n', ' ')
2905-
comment = comment.replace('\n', ' ')
2906-
if 'Adsorption correction:' not in comment:
2907-
raise ValueError(f'adsorption correction in unrecognized format {comment}')
2908-
2909-
# Handle the gas-phase portion first
2910-
gas_comment = comment.split('Adsorption correction: + ')[0].strip()
2911-
if gas_comment.endswith('.'):
2912-
gas_comment = gas_comment[:-1] # delete the . at the end if it exists
2913-
gas_comment = gas_comment[gas_comment.find('from ', len('Gas phase thermo for ')) + len('from '):]
2914-
dummy_gas_phase_species = Species()
2915-
dummy_gas_phase_species.thermo = NASA()
2916-
dummy_gas_phase_species.thermo.comment = gas_comment
2917-
source = self.extract_source_from_comments(dummy_gas_phase_species)
2918-
2919-
# This is an adsorption correction
2920-
# comment is split into two parts: the gas phase, and the surface adsorption correction
2921-
ads_correction_comment = comment.split('Adsorption correction: +')[-1].strip()
2922-
dummy_adsorption_correction_species = Species()
2923-
dummy_adsorption_correction_species.thermo = NASA()
2924-
dummy_adsorption_correction_species.thermo.comment = ads_correction_comment
2925-
source['ADS'] = self.extract_source_from_comments(dummy_adsorption_correction_species)['GAV']
2926-
2927-
return source
2928-
2929-
# Check for group additivity contributions to the thermo in this species
2930-
2931-
# The contribution of the groups can be either additive or subtracting
2932-
# after changes to the polycyclic algorithm
2863+
def _parse_gav_groups(self, comment):
2864+
"""Extract the groups from the comment"""
2865+
groups = {}
29332866

29342867
comment = comment.replace(' + ', ' +')
29352868
comment = comment.replace(' - ', ' -')
@@ -2939,10 +2872,12 @@ def extract_source_from_comments(self, species):
29392872
# groups are still split by spaces
29402873
comment = comment.replace(')\n+', ') +')
29412874
comment = comment.replace(')\n-', ') -')
2875+
# `Thermo group additivity estimation:\nadsorptionPt111(...)` shows up in
2876+
# adsorbate comments - keep the trailing colon separated from the group token.
2877+
comment = comment.replace(':\n', ': ')
29422878
comment = comment.replace('\n', '')
29432879
tokens = comment.split(' ')
29442880

2945-
groups = {}
29462881
group_types = list(self.groups.keys())
29472882

29482883
regex = r"\((.*)\)" # only hit outermost parentheses
@@ -2970,14 +2905,180 @@ def extract_source_from_comments(self, species):
29702905

29712906
if groups:
29722907
# Indicate that group additivity is used when it is either an HBI correction
2973-
# onto a thermo library or QM value, or if the entire molecule is estimated using group additivity
2908+
# onto a thermo library or QM value, or if the entire molecule is estimated using group additivity
29742909
# Save the groups into the source dictionary
29752910

2976-
# Convert groups back into tuples
2911+
# Convert groups back into tuples
29772912
for groupType, groupDict in groups.items():
29782913
groups[groupType] = list(groupDict.items())
29792914

2980-
source['GAV'] = groups
2915+
return groups
2916+
2917+
def _parse_library_source(self, comment, library_species):
2918+
# handle the library source comment, which looks like "Thermo library: library_name"
2919+
# we then need to retrieve the specific library entry given the species
2920+
# but may have unfortunate line breaks in the middle
2921+
2922+
# trim the comment down to just the library portion so it starts with Thermo library:
2923+
split_loc = comment.find('Thermo library:')
2924+
if split_loc == -1:
2925+
raise ValueError(f"Expected 'Thermo library:' in comment, got {comment}")
2926+
2927+
comment = comment[split_loc:]
2928+
2929+
# library name is the token that comes immediately after 'Thermo library:'
2930+
assert 'Thermo library:' in comment, f"Expected 'Thermo library:' in comment, got {comment}"
2931+
tokens = comment.split()
2932+
library_name = tokens[2]
2933+
2934+
results = self.get_thermo_data_from_library(library_species, self.libraries[library_name])
2935+
if results is None:
2936+
raise DatabaseError(f"Could not find a library match for {library_species} in library {library_name}")
2937+
2938+
data, thermo_library, library_entry = results
2939+
return (library_name, library_entry)
2940+
2941+
def _parse_adsorption_correction(self, comment):
2942+
# handle the adsorption correction comment, which looks like
2943+
# "Adsorption correction: + Thermo group additivity estimation: adsorptionPt111(C-XR2CR3)"
2944+
# but may have unfortunate line breaks in the middle
2945+
2946+
# check that the number of tokens matches our expectation for an adsorption correction
2947+
# should be 8, maybe 9 if there was a weird line break
2948+
tokens = comment.split()
2949+
if len(tokens) not in [8, 9]:
2950+
raise ValueError(f"Expected 8 or 9 tokens in adsorption correction comment, got {len(tokens)}: {comment}")
2951+
2952+
ADS = self._parse_gav_groups(comment)
2953+
2954+
if len(ADS) > 1:
2955+
raise ValueError("Only adsorption corrections should be present in the adsorption correction portion of the comment. Found: {}".format(ADS))
2956+
2957+
return ADS
2958+
2959+
def extract_source_from_comments(self, species):
2960+
"""
2961+
`species`: A species object containing thermo data and thermo data comments
2962+
2963+
Parses the verbose string of comments from the thermo data of the species object,
2964+
and extracts the thermo sources.
2965+
2966+
Returns a dictionary with keys of 'Library', 'QM', 'ADS', and/or 'GAV'.
2967+
Commonly, species thermo are estimated using only one of these sources.
2968+
However, a radical can be estimated with more than one type of source, for
2969+
instance a saturated library value and a GAV HBI correction, or a QM saturated value
2970+
and a GAV HBI correction. Adsorbates can be estimated using a single library
2971+
for the adsorbate or a combination of a gas phase library for the
2972+
gas phase portion and an adsorption correction.
2973+
2974+
source = {'Library': (String_Name_of_Library_Used, Library_Entry_Used),
2975+
'QM': (String_of_Method_Used, Molecule_Used_for_QM),
2976+
'GAV': Dictionary_of_Groups_Used,
2977+
'ADS': Dictionary_of_Adsorption_Group_Used,
2978+
}
2979+
2980+
The Dictionary_of_Groups_Used looks like
2981+
{'groupType':[List of tuples containing (Entry, Weight)]
2982+
"""
2983+
2984+
# TODO: solvent, electrocat, LSR
2985+
source = {}
2986+
2987+
comment = species.thermo.comment
2988+
tokens = comment.split()
2989+
2990+
ads_correction = 'Gas phase thermo' in comment and 'Adsorption correction:' in comment
2991+
library = 'Thermo library' in comment
2992+
QM = 'QM' in tokens
2993+
GAV = 'Thermo group additivity estimation:' in comment # ambiguous since ads correction looks identical to group
2994+
2995+
# the biggest thing to split on first is the adsorption correction
2996+
if ads_correction:
2997+
# The source options here are:
2998+
# (Library(gas-phase species), Adsorption correction)
2999+
# (QM(gas-phase species), Adsorption correction) <--- not really, QM is dead/sleeping
3000+
# (GAV(gas-phase species), Adsorption correction)
3001+
# (Library(gas-phase species), GAV(radical correction), Adsorption correction)
3002+
# (QM(gas-phase species), GAV(radical correction), Adsorption correction) <--- not really, QM is dead/sleeping
3003+
3004+
# split the comment into the gas phase thermo portion and the adsorption correction portion
3005+
split_loc = comment.find('Adsorption correction:')
3006+
if split_loc == -1:
3007+
raise ValueError(f"Expected 'Adsorption correction:' in comment, got {comment}")
3008+
gas_comment = comment[:split_loc].strip()
3009+
if gas_comment.endswith('.'):
3010+
gas_comment = gas_comment[:-1] # the period that closed the gas-phase sentence
3011+
ads_correction_comment = comment[split_loc:].strip()
3012+
source['ADS'] = self._parse_adsorption_correction(ads_correction_comment)
3013+
3014+
# get the desorbed gas species
3015+
thermo, resonance_data = self.get_thermo_data_for_surface_species(species, return_resonance_data=True)
3016+
desorbed_gas_species = resonance_data[0][2]
3017+
3018+
groups = self._parse_gav_groups(gas_comment)
3019+
if groups: # (Library/QM + GAV + ADS) or (GAV + ADS)
3020+
source['GAV'] = groups # handle the GAV portion
3021+
3022+
if library or QM: # (Library/QM + GAV + ADS)
3023+
# this means the library/QM species is the desorbed, saturated gas-phase version of the adsorbate
3024+
3025+
assert desorbed_gas_species.molecule[0].is_radical(), "Method only valid for radicals."
3026+
molecule = desorbed_gas_species.molecule[0] # no need to deepcopy again since get_desorbed_molecules already does deepcopy
3027+
molecule.saturate_radicals() # note, this returns a dictionary instead of the Molecule object, but it modifies the molecule in place, so we can just ignore the returned dictionary
3028+
saturated_desorbed_gas_species = Species(molecule=[molecule])
3029+
3030+
if library: # (Library(gas-phase species), GAV(radical correction), Adsorption correction)
3031+
source['Library'] = self._parse_library_source(gas_comment, saturated_desorbed_gas_species)
3032+
if QM: # (QM(gas-phase species), GAV(radical correction), Adsorption correction)
3033+
# whatever token comes immediately after 'QM' is the method used
3034+
source['QM'] = (tokens[tokens.index('QM') + 1], saturated_desorbed_gas_species)
3035+
3036+
else:
3037+
# no groups, so this is (Library/QM + ADS)
3038+
# in this case, the library/QM species is the desorbed gas-phase molecule of the adsorbate
3039+
if library:
3040+
source['Library'] = self._parse_library_source(gas_comment, desorbed_gas_species)
3041+
if QM:
3042+
# whatever token comes immediately after 'QM' is the method used
3043+
source['QM'] = (tokens[tokens.index('QM') + 1], desorbed_gas_species)
3044+
3045+
else:
3046+
# gas phase only, source options are:
3047+
# (Library)
3048+
# (QM)
3049+
# (GAV)
3050+
# (Library, GAV)
3051+
# (QM, GAV)
3052+
groups = self._parse_gav_groups(comment)
3053+
GAV = 'Thermo group additivity estimation:' in comment
3054+
if GAV and not groups:
3055+
raise ValueError("No groups were found in the comments but 'Thermo group additivity estimation:' was in the comment. Comment: {}".format(comment))
3056+
elif not GAV and groups:
3057+
if 'radical' not in groups.keys():
3058+
raise ValueError("Groups were found in the comments but 'Thermo group additivity estimation:' was not in the comment. Comment: {}".format(comment))
3059+
3060+
if groups:
3061+
# Get groups first
3062+
source['GAV'] = groups
3063+
if library or QM: # (Library/QM, GAV)
3064+
# get the saturated species for the library source
3065+
if 'radical' not in groups.keys():
3066+
raise ValueError("Method only valid for radicals, but no radical groups were found. Comment: {}".format(comment))
3067+
molecule = deepcopy(species.molecule[0])
3068+
assert molecule.is_radical(), "Method only valid for radicals."
3069+
molecule.saturate_radicals() # note, this returns a dictionary instead of the Molecule object, but it modifies the molecule in place, so we can just ignore the returned dictionary
3070+
saturated_species = Species(molecule=[molecule])
3071+
if library: # (Library, GAV)
3072+
source['Library'] = self._parse_library_source(comment, saturated_species)
3073+
if QM: # (QM, GAV)
3074+
# whatever token comes immediately after 'QM' is the method used
3075+
source['QM'] = (tokens[tokens.index('QM') + 1], saturated_species)
3076+
else: # (Library) or (QM)
3077+
if library:
3078+
source['Library'] = self._parse_library_source(comment, species)
3079+
if QM:
3080+
# whatever token comes immediately after 'QM' is the method used
3081+
source['QM'] = (tokens[tokens.index('QM') + 1], species)
29813082

29823083
# Perform a sanity check that this molecule is estimated by at least one method
29833084
if not list(source.keys()):

0 commit comments

Comments
 (0)