Skip to content

Commit 415cb42

Browse files
committed
add ability to load BEEF correlations for UQ
1 parent 731b402 commit 415cb42

1 file changed

Lines changed: 206 additions & 10 deletions

File tree

rmgpy/tools/uncertainty.py

Lines changed: 206 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,10 @@
2828
###############################################################################
2929

3030
import os
31-
import re
3231

3332
import numpy as np
3433

34+
import rmgpy.data.thermo
3535
import rmgpy.util as util
3636
from rmgpy.species import Species
3737
from rmgpy.tools.data import GenericData
@@ -43,7 +43,7 @@ class ThermoParameterUncertainty(object):
4343
This class is an engine that generates the species uncertainty based on its thermo sources.
4444
"""
4545

46-
def __init__(self, dG_library=1.5, dG_QM=3.0, dG_GAV=1.5, dG_group=0.7159, dG_ADS_correction=6.918, dG_surf_lib=6.918):
46+
def __init__(self, dG_library=1.5, dG_QM=3.0, dG_GAV=1.5, dG_group=0.7159, dG_ADS_correction=6.918, dG_surf_lib=6.918, other_covariances=None):
4747
"""
4848
Initialize the different uncertainties dG_library, dG_QM, dG_GAV, and dG_other with set values
4949
in units of kcal/mol.
@@ -57,6 +57,7 @@ def __init__(self, dG_library=1.5, dG_QM=3.0, dG_GAV=1.5, dG_group=0.7159, dG_AD
5757
self.dG_group = dG_group
5858
self.dG_ADS_correction = dG_ADS_correction
5959
self.dG_surf_lib = dG_surf_lib
60+
self.other_covariances = other_covariances # storage of covariances as a dict. Keys are sorted tuples of parameter labels and values are covariances
6061

6162
def get_uncertainty_value(self, source):
6263
"""
@@ -66,7 +67,14 @@ def get_uncertainty_value(self, source):
6667
if 'Library' in source:
6768
varG += self.dG_library * self.dG_library
6869
if 'Surface_Library' in source:
69-
varG += self.dG_surf_lib * self.dG_surf_lib
70+
surf_lib_varG = self.dG_surf_lib * self.dG_surf_lib
71+
# covariance libraries should overrule the default uncertainties when available
72+
if self.other_covariances is not None:
73+
label = f'Surface_Library {source["Surface_Library"]}' # match the covariance dict label format
74+
cov_label = (label, label)
75+
if cov_label in self.other_covariances:
76+
surf_lib_varG = self.other_covariances[cov_label]
77+
varG += surf_lib_varG # Add the variance of the surface library parameter if covariance is not specified in the covariance libraries
7078
if 'QM' in source:
7179
varG += self.dG_QM * self.dG_QM
7280
if 'GAV' in source:
@@ -172,6 +180,12 @@ def _get_covariance_qq(self, q_label1, q_label2):
172180
if corr_type1 is None or corr_type2 is None:
173181
raise ValueError(f'Could not determine the type of the correlated parameters from their labels {q_label1} and {q_label2}')
174182

183+
if self.other_covariances is not None:
184+
# check if covariance is specified in other_covariances dict
185+
sorted_labels = tuple(sorted([q_label1, q_label2]))
186+
if sorted_labels in self.other_covariances:
187+
return self.other_covariances[sorted_labels]
188+
175189
if corr_type1 != corr_type2:
176190
return 0
177191
elif q_label1 == q_label2:
@@ -394,11 +408,15 @@ class Uncertainty(object):
394408
for a single RMG-generated mechanism.
395409
"""
396410

397-
def __init__(self, species_list=None, reaction_list=None, output_directory=''):
411+
def __init__(self, species_list=None, reaction_list=None, output_directory='', thermo_covariance_libraries=None,
412+
kinetic_covariance_libraries=None, thermo_covariance_groups=None):
398413
"""
399414
`species_list`: list of RMG species objects
400415
`reaction_list`: list of RMG reaction objects
401416
`outputDirectoy`: directory path for saving output files from the analyses
417+
`thermo_covariance_libraries`: list of library paths to pull additional thermo covariances from
418+
`kinetic_covariance_libraries`: list of library paths to pull additional kinetic covariances from
419+
`thermo_covariance_groups`: list of groups to get additional thermo covariances from
402420
"""
403421
self.database = None
404422
self.species_list = species_list
@@ -418,6 +436,11 @@ def __init__(self, species_list=None, reaction_list=None, output_directory=''):
418436
self.all_thermo_intermediates = None # list of labels of underlying thermo parameters
419437
self.all_kinetics_intermediates = None # list of labels of underlying kinetic parameters
420438
self.output_directory = output_directory if output_directory else os.getcwd()
439+
self.thermo_covariance_libraries = thermo_covariance_libraries
440+
self.thermo_covariance_groups = thermo_covariance_groups
441+
self.kinetic_covariance_libraries = kinetic_covariance_libraries
442+
self.thermo_covariances_dict = {} # dictionary to store covariances from covariance libraries
443+
self.kinetic_covariances_dict = {} # dictionary to store covariances from covariance libraries
421444

422445
# For extra species needed for correlated analysis but not in model
423446
self.extra_species = []
@@ -507,6 +530,154 @@ def retrieve_saturated_species_from_list(self, species):
507530
else:
508531
raise Exception('Could not retrieve saturated species form of {0} from the species list'.format(species))
509532

533+
def load_thermo_covariances_from_libraries(self):
534+
from rmgpy.chemkin import load_species_dictionary
535+
assert self.database is not None, 'Must load database before loading covariance libraries, since we need the path to the covariance libraries from the database'
536+
if self.thermo_covariance_libraries is not None:
537+
for cov_lib in self.thermo_covariance_libraries:
538+
library_name = os.path.basename(cov_lib)
539+
if library_name in self.database.thermo.libraries:
540+
library = self.database.thermo.libraries[library_name]
541+
else:
542+
raise ValueError(f'Thermo covariance library {library_name} not found in the loaded database')
543+
544+
covariance_file = os.path.join(cov_lib, 'covariance.npy')
545+
covariance_species = os.path.join(cov_lib, 'species_dictionary.txt')
546+
if not os.path.isfile(covariance_file):
547+
raise ValueError(f'Thermo covariance file {covariance_file} not found in library {cov_lib}')
548+
if not os.path.isfile(covariance_species):
549+
raise ValueError(f'Thermo species file {covariance_species} not found in library {cov_lib}')
550+
cov_data = np.load(covariance_file)
551+
cov_species_dict = load_species_dictionary(covariance_species)
552+
cov_specs = [item for _, item in cov_species_dict.items()]
553+
554+
# quick check to make sure the covariance data and molecule data are consistent with each other
555+
if cov_data.shape[0] != len(cov_specs):
556+
raise ValueError(f'Covariance data and molecule data in library {cov_lib} are inconsistent: covariance data has shape {cov_data.shape} but molecule data has length {len(cov_specs)}')
557+
558+
# load the labels, but only include species in the model
559+
subset_indices = [] # keep track of indices relevant to the model
560+
for i_lib, lib_species in enumerate(cov_specs):
561+
i_sp = get_i_thing(lib_species, self.species_list)
562+
if i_sp < 0:
563+
continue
564+
565+
# make sure the species actually comes from this library, otherwise skip
566+
result = self.database.thermo.get_thermo_data_from_library(lib_species, library)
567+
if result is not None:
568+
surface_prefix = 'Surface_' if lib_species.contains_surface_site() else ''
569+
# match the label as constructed in assign_intermediate_uncertainties,
570+
# where the number corresponds to the index of the species in species_list
571+
try:
572+
label = f'{surface_prefix}Library {self.species_list[i_sp].to_chemkin()}'
573+
except IndexError:
574+
label = f'{surface_prefix}Library {self.extra_species[i_sp - len(self.species_list)].to_chemkin()}'
575+
lib_species.label = label
576+
subset_indices.append(i_lib)
577+
578+
# fill in the dictionary of covariances from the covariance libraries,
579+
# with keys being sorted tuples of the labels of the correlated parameters
580+
# and values being the covariance between those parameters
581+
tolerance = 1e-12 # consider anything with covariance less than this to be uncorrelated
582+
for i, index_i in enumerate(subset_indices):
583+
for j in range(i, len(subset_indices)):
584+
index_j = subset_indices[j]
585+
if cov_data[index_i, index_j] > tolerance:
586+
label1 = cov_specs[index_i].label
587+
label2 = cov_specs[index_j].label
588+
covariance = cov_data[index_i, index_j]
589+
self.thermo_covariances_dict[tuple(sorted([label1, label2]))] = covariance
590+
591+
def load_thermo_covariances_from_groups(self):
592+
# assumes there might also be covariances associated with library entries
593+
594+
# associated library is hardcoded for now
595+
associated_libraries = {
596+
'adsorptionPt111': 'surfaceThermoPt111',
597+
}
598+
associated_library = None
599+
600+
assert self.database is not None, 'Must load database before loading covariance groups, since we need the path to the covariance groups/libraries from the database'
601+
if self.thermo_covariance_groups is not None:
602+
for cov_group_tree in self.thermo_covariance_groups:
603+
cov_group_tree_name = os.path.basename(cov_group_tree)
604+
if cov_group_tree_name in self.database.thermo.groups:
605+
grouptree = self.database.thermo.groups[cov_group_tree_name]
606+
else:
607+
raise ValueError(f'Thermo covariance library {cov_group_tree_name} not found in the loaded database')
608+
if cov_group_tree_name in associated_libraries:
609+
library_name = associated_libraries[cov_group_tree_name]
610+
if library_name in self.database.thermo.libraries:
611+
associated_library = self.database.thermo.libraries[library_name]
612+
else:
613+
raise ValueError(f'Associated library {library_name} for covariance group {cov_group_tree_name} not found in the loaded database')
614+
615+
covariance_file = os.path.join(cov_group_tree, 'covariance.npy')
616+
group_database_file = os.path.join(cov_group_tree, 'groups.py')
617+
group_database = rmgpy.data.thermo.ThermoGroups()
618+
group_database.load(group_database_file)
619+
620+
covariance_molecules = os.path.join(cov_group_tree, 'molecules.pickle')
621+
if not os.path.isfile(covariance_file):
622+
raise ValueError(f'Thermo covariance file {covariance_file} not found in {cov_group_tree}')
623+
if not os.path.isfile(covariance_molecules):
624+
raise ValueError(f'Thermo molecules file {covariance_molecules} not found in {cov_group_tree}')
625+
cov_data = np.load(covariance_file)
626+
627+
# reconstruct the groups and molecules stored in the molecules.pickle file
628+
629+
with open(covariance_molecules, 'rb') as f:
630+
adj_lists = pickle.load(f)
631+
reconstructed_items = []
632+
for i in range(len(adj_lists)):
633+
try:
634+
item = rmgpy.molecule.Molecule().from_adjacency_list(adj_lists[i])
635+
except ValueError:
636+
item = rmgpy.molecule.Group().from_adjacency_list(adj_lists[i])
637+
reconstructed_items.append(item)
638+
assert len(cov_data) == len(reconstructed_items)
639+
640+
n_groups = np.sum([isinstance(mol, rmgpy.molecule.Group) for mol in reconstructed_items])
641+
n_mols = np.sum([isinstance(mol, rmgpy.molecule.Molecule) for mol in reconstructed_items])
642+
643+
644+
# quick check to make sure the covariance data and molecule data are consistent with each other
645+
if cov_data.shape[0] != len(cov_specs):
646+
raise ValueError(f'Covariance data and molecule data in library {cov_lib} are inconsistent: covariance data has shape {cov_data.shape} but molecule data has length {len(cov_specs)}')
647+
648+
# load the labels, but only include species in the model
649+
subset_indices = [] # keep track of indices relevant to the model
650+
for i_lib, lib_species in enumerate(cov_specs):
651+
i_sp = get_i_thing(lib_species, self.species_list)
652+
if i_sp < 0:
653+
continue
654+
655+
# make sure the species actually comes from this library, otherwise skip
656+
result = self.database.thermo.get_thermo_data_from_library(lib_species, library)
657+
if result is not None:
658+
surface_prefix = 'Surface_' if lib_species.contains_surface_site() else ''
659+
# match the label as constructed in assign_intermediate_uncertainties,
660+
# where the number corresponds to the index of the species in species_list
661+
try:
662+
label = f'{surface_prefix}Library {self.species_list[i_sp].to_chemkin()}'
663+
except IndexError:
664+
label = f'{surface_prefix}Library {self.extra_species[i_sp - len(self.species_list)].to_chemkin()}'
665+
lib_species.label = label
666+
subset_indices.append(i_lib)
667+
668+
# fill in the dictionary of covariances from the covariance libraries,
669+
# with keys being sorted tuples of the labels of the correlated parameters
670+
# and values being the covariance between those parameters
671+
tolerance = 1e-12 # consider anything with covariance less than this to be uncorrelated
672+
for i, index_i in enumerate(subset_indices):
673+
for j in range(i, len(subset_indices)):
674+
index_j = subset_indices[j]
675+
if cov_data[index_i, index_j] > tolerance:
676+
label1 = cov_specs[index_i].label
677+
label2 = cov_specs[index_j].label
678+
covariance = cov_data[index_i, index_j]
679+
self.thermo_covariances_dict[tuple(sorted([label1, label2]))] = covariance
680+
510681
def extract_sources_from_model(self):
511682
"""
512683
Extract the source data from the model using its comments.
@@ -626,6 +797,9 @@ def extract_sources_from_model(self):
626797
for spc in self.extra_species:
627798
self.species_list.remove(spc)
628799

800+
# -------------------- load covariance libraries ------------------------#
801+
self.load_thermo_covariances_from_libraries()
802+
629803
def compile_all_sources(self):
630804
"""
631805
Compile two dictionaries composed of all the thermo and kinetic sources. Must
@@ -721,7 +895,7 @@ def assign_parameter_uncertainties(self, g_param_engine=None, k_param_engine=Non
721895
Assign uncertainties based on the sources of the species thermo and reaction kinetics.
722896
"""
723897
if g_param_engine is None:
724-
g_param_engine = ThermoParameterUncertainty()
898+
g_param_engine = ThermoParameterUncertainty(other_covariances=self.thermo_covariances_dict)
725899
if k_param_engine is None:
726900
k_param_engine = KineticParameterUncertainty()
727901

@@ -730,7 +904,15 @@ def assign_parameter_uncertainties(self, g_param_engine=None, k_param_engine=Non
730904

731905
for species in self.species_list:
732906
if not correlated:
733-
dG = g_param_engine.get_uncertainty_value(self.species_sources_dict[species])
907+
entry = self.species_sources_dict[species]
908+
if 'Surface_Library' in entry: # preconditioning for covariance
909+
# this is an ugly workaround to handle covariances: because get_uncertainty_value needs the species chemkin string to get the covariance
910+
# but the source dictionary only has the index of the surface library entry
911+
entry_copy = entry.copy()
912+
entry_copy['Surface_Library'] = self.species_list[entry_copy['Surface_Library']].to_chemkin()
913+
dG = g_param_engine.get_uncertainty_value(entry_copy)
914+
else:
915+
dG = g_param_engine.get_uncertainty_value(self.species_sources_dict[species])
734916
self.thermo_input_uncertainties.append(dG)
735917
else:
736918
source = self.species_sources_dict[species]
@@ -854,7 +1036,7 @@ def assign_intermediate_uncertainties(self, g_param_engine=None, k_param_engine=
8541036
But instead of assuming all underlying parameters are independent, here we can allow for dependence as long as we have the covariance
8551037
"""
8561038
if g_param_engine is None:
857-
g_param_engine = ThermoParameterUncertainty()
1039+
g_param_engine = ThermoParameterUncertainty(other_covariances=self.thermo_covariances_dict)
8581040
if k_param_engine is None:
8591041
k_param_engine = KineticParameterUncertainty()
8601042

@@ -863,7 +1045,15 @@ def assign_intermediate_uncertainties(self, g_param_engine=None, k_param_engine=
8631045

8641046
for species in self.species_list:
8651047
if not correlated:
866-
dG = g_param_engine.get_uncertainty_value(self.species_sources_dict[species])
1048+
entry = self.species_sources_dict[species]
1049+
if 'Surface_Library' in entry: # preconditioning for covariance
1050+
# this is an ugly workaround to handle covariances: because get_uncertainty_value needs the species chemkin string to get the covariance
1051+
# but the source dictionary only has the index of the surface library entry
1052+
entry_copy = entry.copy()
1053+
entry_copy['Surface_Library'] = self.species_list[entry_copy['Surface_Library']].to_chemkin()
1054+
dG = g_param_engine.get_uncertainty_value(entry_copy)
1055+
else:
1056+
dG = g_param_engine.get_uncertainty_value(self.species_sources_dict[species])
8671057
self.thermo_intermediate_uncertainties.append(dG) # in the uncorrelated case, the intermediate is just the uncertainty value itself, since there is only one parameter that contributes to the uncertainty
8681058
else:
8691059
source = self.species_sources_dict[species]
@@ -1248,7 +1438,7 @@ def get_thermo_covariance_matrix(self, g_param_engine=None):
12481438
self.thermo_covariance_matrix = np.zeros((len(self.species_list), len(self.species_list)))
12491439

12501440
if g_param_engine is None:
1251-
g_param_engine = ThermoParameterUncertainty()
1441+
g_param_engine = ThermoParameterUncertainty(other_covariances=self.thermo_covariances_dict)
12521442

12531443
for i in range(len(self.species_list)):
12541444
for j in range((len(self.species_list))):
@@ -1308,7 +1498,7 @@ def _get_intermediate_thermo_covariance_matrix(self, g_param_engine=None, subset
13081498
return self.Sigma_ww_thermo
13091499

13101500
if g_param_engine is None:
1311-
g_param_engine = ThermoParameterUncertainty()
1501+
g_param_engine = ThermoParameterUncertainty(other_covariances=self.thermo_covariances_dict)
13121502

13131503
self.all_thermo_intermediates = set()
13141504
for sp_idx in subset_indices:
@@ -1455,3 +1645,9 @@ def process_local_results(results, sensitive_species, number=10):
14551645
output += '================================================================================\n\n'
14561646

14571647
return processed_results, output
1648+
1649+
def get_i_thing(thing, thing_list):
1650+
for i in range(len(thing_list)):
1651+
if thing.is_isomorphic(thing_list[i]):
1652+
return i
1653+
return -1

0 commit comments

Comments
 (0)