add ability to load BEEF correlations for UQ

sevyharris · sevyharris · commit 415cb4287d8d · 2026-05-11T12:27:30.000-04:00
diff --git a/rmgpy/tools/uncertainty.py b/rmgpy/tools/uncertainty.py
@@ -28,10 +28,10 @@
 ###############################################################################
 
 import os
-import re
 
 import numpy as np
 
+import rmgpy.data.thermo
 import rmgpy.util as util
 from rmgpy.species import Species
 from rmgpy.tools.data import GenericData
@@ -43,7 +43,7 @@ class ThermoParameterUncertainty(object):
     This class is an engine that generates the species uncertainty based on its thermo sources.
     """
 
-    def __init__(self, dG_library=1.5, dG_QM=3.0, dG_GAV=1.5, dG_group=0.7159, dG_ADS_correction=6.918, dG_surf_lib=6.918):
+    def __init__(self, dG_library=1.5, dG_QM=3.0, dG_GAV=1.5, dG_group=0.7159, dG_ADS_correction=6.918, dG_surf_lib=6.918, other_covariances=None):
         """
         Initialize the different uncertainties dG_library, dG_QM, dG_GAV, and dG_other with set values
         in units of kcal/mol.
@@ -57,6 +57,7 @@ def __init__(self, dG_library=1.5, dG_QM=3.0, dG_GAV=1.5, dG_group=0.7159, dG_AD
         self.dG_group = dG_group
         self.dG_ADS_correction = dG_ADS_correction
         self.dG_surf_lib = dG_surf_lib
+        self.other_covariances = other_covariances  # storage of covariances as a dict. Keys are sorted tuples of parameter labels and values are covariances
 
     def get_uncertainty_value(self, source):
         """
@@ -66,7 +67,14 @@ def get_uncertainty_value(self, source):
         if 'Library' in source:
             varG += self.dG_library * self.dG_library
         if 'Surface_Library' in source:
-            varG += self.dG_surf_lib * self.dG_surf_lib
+            surf_lib_varG = self.dG_surf_lib * self.dG_surf_lib
+            # covariance libraries should overrule the default uncertainties when available
+            if self.other_covariances is not None:
+                label = f'Surface_Library {source["Surface_Library"]}'  # match the covariance dict label format
+                cov_label = (label, label)
+                if cov_label in self.other_covariances:
+                    surf_lib_varG = self.other_covariances[cov_label]
+            varG += surf_lib_varG  # Add the variance of the surface library parameter if covariance is not specified in the covariance libraries
         if 'QM' in source:
             varG += self.dG_QM * self.dG_QM
         if 'GAV' in source:
@@ -172,6 +180,12 @@ def _get_covariance_qq(self, q_label1, q_label2):
         if corr_type1 is None or corr_type2 is None:
             raise ValueError(f'Could not determine the type of the correlated parameters from their labels {q_label1} and {q_label2}')
 
+        if self.other_covariances is not None:
+            # check if covariance is specified in other_covariances dict
+            sorted_labels = tuple(sorted([q_label1, q_label2]))
+            if sorted_labels in self.other_covariances:
+                return self.other_covariances[sorted_labels]
+
         if corr_type1 != corr_type2:
             return 0
         elif q_label1 == q_label2:
@@ -394,11 +408,15 @@ class Uncertainty(object):
     for a single RMG-generated mechanism.
     """
 
-    def __init__(self, species_list=None, reaction_list=None, output_directory=''):
+    def __init__(self, species_list=None, reaction_list=None, output_directory='', thermo_covariance_libraries=None,
+                 kinetic_covariance_libraries=None, thermo_covariance_groups=None):
         """
         `species_list`: list of RMG species objects
         `reaction_list`: list of RMG reaction objects
         `outputDirectoy`: directory path for saving output files from the analyses
+        `thermo_covariance_libraries`: list of library paths to pull additional thermo covariances from
+        `kinetic_covariance_libraries`: list of library paths to pull additional kinetic covariances from
+        `thermo_covariance_groups`: list of groups to get additional thermo covariances from
         """
         self.database = None
         self.species_list = species_list
@@ -418,6 +436,11 @@ def __init__(self, species_list=None, reaction_list=None, output_directory=''):
         self.all_thermo_intermediates = None            # list of labels of underlying thermo parameters
         self.all_kinetics_intermediates = None          # list of labels of underlying kinetic parameters
         self.output_directory = output_directory if output_directory else os.getcwd()
+        self.thermo_covariance_libraries = thermo_covariance_libraries
+        self.thermo_covariance_groups = thermo_covariance_groups
+        self.kinetic_covariance_libraries = kinetic_covariance_libraries
+        self.thermo_covariances_dict = {}  # dictionary to store covariances from covariance libraries
+        self.kinetic_covariances_dict = {}  # dictionary to store covariances from covariance libraries
 
         # For extra species needed for correlated analysis but not in model
         self.extra_species = []
@@ -507,6 +530,154 @@ def retrieve_saturated_species_from_list(self, species):
         else:
             raise Exception('Could not retrieve saturated species form of {0} from the species list'.format(species))
 
+    def load_thermo_covariances_from_libraries(self):
+        """
+        Fill `self.thermo_covariances_dict` from each directory in `self.thermo_covariance_libraries`,
+        which must hold `covariance.npy` and `species_dictionary.txt` and be named after a loaded thermo library.
+
+        Row/column i of the covariance array is paired with the i-th species of the dictionary. Only species
+        isomorphic to one in `self.species_list` with thermo in that library are kept. Keys are sorted tuples
+        of '[Surface_]Library <to_chemkin() string>' labels and values are the covariances.
+        Raises ValueError on a missing library/file or when the array is not square with one row per species.
+        """
+        from rmgpy.chemkin import load_species_dictionary
+        assert self.database is not None, 'Must load database before loading covariance libraries, since we need the path to the covariance libraries from the database'
+        if self.thermo_covariance_libraries is None:
+            return
+        tolerance = 1e-12  # consider anything with covariance magnitude not above this to be uncorrelated
+        for cov_lib in self.thermo_covariance_libraries:
+            # normpath so that a trailing path separator does not produce an empty library name
+            library_name = os.path.basename(os.path.normpath(cov_lib))
+            if library_name not in self.database.thermo.libraries:
+                raise ValueError(f'Thermo covariance library {library_name} not found in the loaded database')
+            library = self.database.thermo.libraries[library_name]
+
+            covariance_file = os.path.join(cov_lib, 'covariance.npy')
+            covariance_species = os.path.join(cov_lib, 'species_dictionary.txt')
+            if not os.path.isfile(covariance_file):
+                raise ValueError(f'Thermo covariance file {covariance_file} not found in library {cov_lib}')
+            if not os.path.isfile(covariance_species):
+                raise ValueError(f'Thermo species file {covariance_species} not found in library {cov_lib}')
+            cov_data = np.load(covariance_file)
+            cov_specs = list(load_species_dictionary(covariance_species).values())
+
+            # the covariance array is indexed [i, j] below, so it must be square with one row per species
+            n_specs = len(cov_specs)
+            if cov_data.shape != (n_specs, n_specs):
+                raise ValueError(f'Covariance data and molecule data in library {cov_lib} are inconsistent: covariance data has shape {cov_data.shape} but molecule data has length {n_specs}')
+
+            # (row index, parameter label) of every library species that is also in the model
+            subset = []
+            for i_lib, lib_species in enumerate(cov_specs):
+                i_sp = get_i_thing(lib_species, self.species_list)
+                if i_sp < 0:
+                    continue
+                # make sure the species actually comes from this library, otherwise skip
+                if self.database.thermo.get_thermo_data_from_library(lib_species, library) is None:
+                    continue
+                surface_prefix = 'Surface_' if lib_species.contains_surface_site() else ''
+                # i_sp was found by searching species_list, so it is always a valid index into it
+                label = f'{surface_prefix}Library {self.species_list[i_sp].to_chemkin()}'
+                subset.append((i_lib, label))
+
+            # store the upper triangle (diagonal included) of the model-relevant block,
+            # keyed by the sorted pair of labels of the correlated parameters
+            for i, (index_i, label1) in enumerate(subset):
+                for index_j, label2 in subset[i:]:
+                    covariance = cov_data[index_i, index_j]
+                    if abs(covariance) > tolerance:  # magnitude, so negative covariances are kept
+                        self.thermo_covariances_dict[tuple(sorted([label1, label2]))] = covariance
+
+    def load_thermo_covariances_from_groups(self):
+        """
+        Fill `self.thermo_covariances_dict` from each directory in `self.thermo_covariance_groups`, which must
+        contain `covariance.npy` and `molecules.pickle` and be named after a loaded thermo group tree.
+
+        `molecules.pickle` holds one adjacency list per row/column of the covariance array; each one is rebuilt
+        as a Molecule or, if that raises ValueError, as a Group. Molecule entries are treated as entries of the
+        hardcoded associated library (assumes there might also be covariances associated with library entries)
+        and stored under sorted tuples of '[Surface_]Library <to_chemkin() string>' labels.
+
+        Raises ValueError on missing group trees/libraries/files or inconsistent sizes.
+
+        NOTE(review): the label format of group parameters is not visible from this method, so covariances
+        involving Group entries are not stored yet, and `groups.py` is no longer loaded because its contents
+        were never used -- TODO confirm the label format and add the group covariances.
+        """
+        import pickle  # local import so this method does not depend on a file-level pickle import
+        from rmgpy.molecule import Group, Molecule
+
+        # associated library is hardcoded for now
+        associated_libraries = {
+            'adsorptionPt111': 'surfaceThermoPt111',
+        }
+
+        assert self.database is not None, 'Must load database before loading covariance groups, since we need the path to the covariance groups/libraries from the database'
+        if self.thermo_covariance_groups is None:
+            return
+        tolerance = 1e-12  # consider anything with covariance magnitude not above this to be uncorrelated
+        for cov_group_tree in self.thermo_covariance_groups:
+            # normpath so that a trailing path separator does not produce an empty name
+            cov_group_tree_name = os.path.basename(os.path.normpath(cov_group_tree))
+            if cov_group_tree_name not in self.database.thermo.groups:
+                raise ValueError(f'Thermo covariance group {cov_group_tree_name} not found in the loaded database')
+
+            # reset for every group tree so a previous tree's library is never reused by mistake
+            associated_library = None
+            if cov_group_tree_name in associated_libraries:
+                library_name = associated_libraries[cov_group_tree_name]
+                if library_name not in self.database.thermo.libraries:
+                    raise ValueError(f'Associated library {library_name} for covariance group {cov_group_tree_name} not found in the loaded database')
+                associated_library = self.database.thermo.libraries[library_name]
+
+            covariance_file = os.path.join(cov_group_tree, 'covariance.npy')
+            covariance_molecules = os.path.join(cov_group_tree, 'molecules.pickle')
+            if not os.path.isfile(covariance_file):
+                raise ValueError(f'Thermo covariance file {covariance_file} not found in {cov_group_tree}')
+            if not os.path.isfile(covariance_molecules):
+                raise ValueError(f'Thermo molecules file {covariance_molecules} not found in {cov_group_tree}')
+            cov_data = np.load(covariance_file)
+
+            # reconstruct the groups and molecules stored in the molecules.pickle file
+            # NOTE: unpickling can execute arbitrary code, so only point this at trusted covariance directories
+            with open(covariance_molecules, 'rb') as f:
+                adj_lists = pickle.load(f)
+            reconstructed_items = []
+            for adj_list in adj_lists:
+                try:
+                    item = Molecule().from_adjacency_list(adj_list)
+                except ValueError:
+                    item = Group().from_adjacency_list(adj_list)
+                reconstructed_items.append(item)
+
+            # quick check to make sure the covariance data and molecule data are consistent with each other
+            n_items = len(reconstructed_items)
+            if cov_data.shape != (n_items, n_items):
+                raise ValueError(f'Covariance data and molecule data in {cov_group_tree} are inconsistent: covariance data has shape {cov_data.shape} but molecule data has length {n_items}')
+
+            # (row index, parameter label) of every molecule entry that is in the model and in the associated library
+            subset = []
+            for i_item, item in enumerate(reconstructed_items):
+                if associated_library is None or not isinstance(item, Molecule):
+                    continue  # Group entries and trees without an associated library are skipped (see docstring NOTE)
+                # wrap in a Species so it can be compared with the model species and passed to the library lookup
+                item_species = Species(molecule=[item])
+                i_sp = get_i_thing(item_species, self.species_list)
+                if i_sp < 0:
+                    continue
+                # make sure the species actually comes from the associated library, otherwise skip
+                if self.database.thermo.get_thermo_data_from_library(item_species, associated_library) is None:
+                    continue
+                surface_prefix = 'Surface_' if item.contains_surface_site() else ''
+                subset.append((i_item, f'{surface_prefix}Library {self.species_list[i_sp].to_chemkin()}'))
+
+            # store the upper triangle (diagonal included) of the model-relevant block under sorted label pairs
+            for i, (index_i, label1) in enumerate(subset):
+                for index_j, label2 in subset[i:]:
+                    covariance = cov_data[index_i, index_j]
+                    if abs(covariance) > tolerance:  # magnitude, so negative covariances are kept
+                        self.thermo_covariances_dict[tuple(sorted([label1, label2]))] = covariance
+
     def extract_sources_from_model(self):
         """
         Extract the source data from the model using its comments.
@@ -626,6 +797,9 @@ def extract_sources_from_model(self):
         for spc in self.extra_species:
             self.species_list.remove(spc)
 
+        # -------------------- load covariance libraries ------------------------#
+        self.load_thermo_covariances_from_libraries()
+
     def compile_all_sources(self):
         """
         Compile two dictionaries composed of all the thermo and kinetic sources.  Must
@@ -721,7 +895,7 @@ def assign_parameter_uncertainties(self, g_param_engine=None, k_param_engine=Non
         Assign uncertainties based on the sources of the species thermo and reaction kinetics.
         """
         if g_param_engine is None:
-            g_param_engine = ThermoParameterUncertainty()
+            g_param_engine = ThermoParameterUncertainty(other_covariances=self.thermo_covariances_dict)
         if k_param_engine is None:
             k_param_engine = KineticParameterUncertainty()
 
@@ -730,7 +904,15 @@ def assign_parameter_uncertainties(self, g_param_engine=None, k_param_engine=Non
 
         for species in self.species_list:
             if not correlated:
-                dG = g_param_engine.get_uncertainty_value(self.species_sources_dict[species])
+                entry = self.species_sources_dict[species]
+                if 'Surface_Library' in entry:  # preconditioning for covariance
+                    # this is an ugly workaround to handle covariances: because get_uncertainty_value needs the species chemkin string to get the covariance
+                    # but the source dictionary only has the index of the surface library entry
+                    entry_copy = entry.copy()
+                    entry_copy['Surface_Library'] = self.species_list[entry_copy['Surface_Library']].to_chemkin()
+                    dG = g_param_engine.get_uncertainty_value(entry_copy)
+                else:
+                    dG = g_param_engine.get_uncertainty_value(self.species_sources_dict[species])
                 self.thermo_input_uncertainties.append(dG)
             else:
                 source = self.species_sources_dict[species]
@@ -854,7 +1036,7 @@ def assign_intermediate_uncertainties(self, g_param_engine=None, k_param_engine=
         But instead of assuming all underlying parameters are independent, here we can allow for dependence as long as we have the covariance
         """
         if g_param_engine is None:
-            g_param_engine = ThermoParameterUncertainty()
+            g_param_engine = ThermoParameterUncertainty(other_covariances=self.thermo_covariances_dict)
         if k_param_engine is None:
             k_param_engine = KineticParameterUncertainty()
 
@@ -863,7 +1045,15 @@ def assign_intermediate_uncertainties(self, g_param_engine=None, k_param_engine=
 
         for species in self.species_list:
             if not correlated:
-                dG = g_param_engine.get_uncertainty_value(self.species_sources_dict[species])
+                entry = self.species_sources_dict[species]
+                if 'Surface_Library' in entry:  # preconditioning for covariance
+                    # this is an ugly workaround to handle covariances: because get_uncertainty_value needs the species chemkin string to get the covariance
+                    # but the source dictionary only has the index of the surface library entry
+                    entry_copy = entry.copy()
+                    entry_copy['Surface_Library'] = self.species_list[entry_copy['Surface_Library']].to_chemkin()
+                    dG = g_param_engine.get_uncertainty_value(entry_copy)
+                else:
+                    dG = g_param_engine.get_uncertainty_value(self.species_sources_dict[species])
                 self.thermo_intermediate_uncertainties.append(dG)  # in the uncorrelated case, the intermediate is just the uncertainty value itself, since there is only one parameter that contributes to the uncertainty
             else:
                 source = self.species_sources_dict[species]
@@ -1248,7 +1438,7 @@ def get_thermo_covariance_matrix(self, g_param_engine=None):
         self.thermo_covariance_matrix = np.zeros((len(self.species_list), len(self.species_list)))
 
         if g_param_engine is None:
-            g_param_engine = ThermoParameterUncertainty()
+            g_param_engine = ThermoParameterUncertainty(other_covariances=self.thermo_covariances_dict)
         
         for i in range(len(self.species_list)):
             for j in range((len(self.species_list))):
@@ -1308,7 +1498,7 @@ def _get_intermediate_thermo_covariance_matrix(self, g_param_engine=None, subset
             return self.Sigma_ww_thermo
 
         if g_param_engine is None:
-            g_param_engine = ThermoParameterUncertainty()
+            g_param_engine = ThermoParameterUncertainty(other_covariances=self.thermo_covariances_dict)
         
         self.all_thermo_intermediates = set()
         for sp_idx in subset_indices:
@@ -1455,3 +1645,9 @@ def process_local_results(results, sensitive_species, number=10):
         output += '================================================================================\n\n'
 
     return processed_results, output
+
+def get_i_thing(thing, thing_list):
+    """Return the index of the first entry of `thing_list` that `thing` is isomorphic to, or -1 if none is."""
+    matches = (idx for idx, candidate in enumerate(thing_list) if thing.is_isomorphic(candidate))
+    # the generator is lazy, so is_isomorphic stops being called at the first match
+    return next(matches, -1)