Merge pull request #2917 to change Blowers-Masel fitting algorithm (when training kinetics trees)

rwest · web-flow · commit 33e1e9577a51 · 2026-04-15T09:30:21.000-04:00
Fix Blowers-Masel fitting: initial guesses, negative E0 handling, and spurious fit rejection
Collected and rebased fixes from David Farina and Nora Khalil (some dating to 2021) for 
ArrheniusBM and ArrheniusChargeTransferBM, with additional refactoring

Main behavioral change is that it now reject BM fits where abs(n) &gt; 5.0 in _make_rule, 
as these indicate reactions not following BM enthalpy-dependence.

Other changes are mostly optimizations and simplifications and edge cases.
diff --git a/rmgpy/data/kinetics/family.py b/rmgpy/data/kinetics/family.py
@@ -3678,19 +3678,22 @@ def cross_validate(self, folds=5, template_rxn_map=None, test_rxn_inds=None, T=1
 
                 uncertainties[rxn] = self.rules.entries[entry.label][0].data.uncertainty
 
-
                 L = list(set(template_rxn_map[entry.label]) - set(rxns_test))
 
                 if L != []:
                     if isinstance(L[0].kinetics, Arrhenius):
                         kinetics = ArrheniusBM().fit_to_reactions(L, recipe=self.forward_recipe.actions)
-                        if kinetics.E0.value_si < 0.0 or len(L) == 1:
+                        # These conditions (eg. n>5) for rejecting the BM fit should correspond
+                        # to those in _marke_rule, and those just below:
+                        if len(L) == 1 or kinetics.E0.value_si < 0.0 or abs(kinetics.n.value_si) > 5.0:
                             kinetics = average_kinetics([r.kinetics for r in L])
                         else:
                             kinetics = kinetics.to_arrhenius(rxn.get_enthalpy_of_reaction(298.))
                     else:
                         kinetics = ArrheniusChargeTransferBM().fit_to_reactions(L, recipe=self.forward_recipe.actions)
-                        if kinetics.E0.value_si < 0.0 or len(L) == 1:
+                        # These conditions (eg. n>5) for rejecting the BM fit should correspond
+                        # to those in _marke_rule, and those just above:
+                        if len(L) == 1 or kinetics.E0.value_si < 0.0 or abs(kinetics.n.value_si) > 5.0:
                             kinetics = average_kinetics([r.kinetics for r in L])
                         else:
                             kinetics = kinetics.to_arrhenius_charge_transfer(rxn.get_enthalpy_of_reaction(298.))
@@ -4596,79 +4599,93 @@ def _make_rule(rr):
     for i, rxn in enumerate(rxns):
         rxn.rank = ranks[i]
     rxns = np.array(rxns)
-    rs = np.array([r for r in rxns if type(r.kinetics) != KineticsModel])
+    rs = np.array([r for r in rxns if type(r.kinetics) is not KineticsModel]) # KineticsModel is the base class with no data.
     n = len(rs)
-    if n > 0 and isinstance(rs[0].kinetics, Marcus):
+    if n == 0:
+        return None
+
+    if isinstance(rs[0].kinetics, Marcus):
         kin = average_kinetics([r.kinetics for r in rs])
         return kin
     data_mean = np.mean(np.log([r.kinetics.get_rate_coefficient(Tref) for r in rs]))
-    if n > 0:
-        if isinstance(rs[0].kinetics, Arrhenius):
-            arr = ArrheniusBM
-        else:
-            arr = ArrheniusChargeTransferBM
-        if n > 1:
-            kin = arr().fit_to_reactions(rs, recipe=recipe)
-        if n == 1 or kin.E0.value_si < 0.0:
-            kin = average_kinetics([r.kinetics for r in rs])
-            #kin.comment = "Only one reaction or Arrhenius BM fit bad. Instead averaged from {} reactions.".format(n)
-            if n == 1:
-                kin.uncertainty = RateUncertainty(mu=0.0, var=(np.log(fmax) / 2.0) ** 2, N=1, Tref=Tref, data_mean=data_mean, correlation=label)
+
+    if isinstance(rs[0].kinetics, Arrhenius):
+        arr = ArrheniusBM
+    else:
+        arr = ArrheniusChargeTransferBM
+    if n > 1:
+        kin = arr().fit_to_reactions(rs, recipe=recipe)
+
+    # If you update the following conditions, also update the cross_validate function.
+    if n == 1 or kin.E0.value_si < 0.0 or abs(kin.n.value_si) > 5.0:
+        if n > 1: #we have an ArrheniusBM
+            if kin.E0.value_si < 0.0:
+                reason = "E0<0"
+            elif abs(kin.n.value_si) > 5.0:
+                reason = "abs(n)>5"
             else:
+                reason = "?"
+
+        # still run it through the averaging function when n=1 to standardize the units and run checks
+        kin = average_kinetics([r.kinetics for r in rs])
+
+        if n == 1:
+            kin.uncertainty = RateUncertainty(mu=0.0, var=(np.log(fmax) / 2.0) ** 2, N=1, Tref=Tref, data_mean=data_mean, correlation=label)
+        else:
+            kin.comment = f"Blowers-Masel fit was bad ({reason}) so instead averaged from {n} reactions."
+            dlnks = np.array([
+                np.log(
+                        average_kinetics([r.kinetics for r in np.delete(rs, i)]).get_rate_coefficient(T=Tref) / rxn.get_rate_coefficient(T=Tref)
+                    ) for i, rxn in enumerate(rs)
+                ])  # 1) fit to set of reactions without the current reaction (k)  2) compute log(kfit/kactual) at Tref
+            varis = (np.array([rank_accuracy_map[rxn.rank].value_si for rxn in rs]) / (2.0 * constants.R * Tref)) ** 2
+            # weighted average calculations
+            ws = 1.0 / varis
+            V1 = ws.sum()
+            V2 = (ws ** 2).sum()
+            mu = np.dot(ws, dlnks) / V1
+            s = np.sqrt(np.dot(ws, (dlnks - mu) ** 2) / (V1 - V2 / V1))
+            kin.uncertainty = RateUncertainty(mu=mu, var=s ** 2, N=n, Tref=Tref, data_mean=data_mean, correlation=label)
+    else:
+        if n == 1:
+            kin.uncertainty = RateUncertainty(mu=0.0, var=(np.log(fmax) / 2.0) ** 2, N=1, Tref=Tref, data_mean=data_mean, correlation=label)
+        else:
+            if isinstance(rs[0].kinetics, Arrhenius):
                 dlnks = np.array([
                     np.log(
-                            average_kinetics([r.kinetics for r in rs[list(set(range(len(rs))) - {i})]]).get_rate_coefficient(T=Tref) / rxn.get_rate_coefficient(T=Tref)
-                        ) for i, rxn in enumerate(rs)
-                    ])  # 1) fit to set of reactions without the current reaction (k)  2) compute log(kfit/kactual) at Tref
-                varis = (np.array([rank_accuracy_map[rxn.rank].value_si for rxn in rs]) / (2.0 * 8.314 * Tref)) ** 2
-                # weighted average calculations
-                ws = 1.0 / varis
-                V1 = ws.sum()
-                V2 = (ws ** 2).sum()
-                mu = np.dot(ws, dlnks) / V1
-                s = np.sqrt(np.dot(ws, (dlnks - mu) ** 2) / (V1 - V2 / V1))
-                kin.uncertainty = RateUncertainty(mu=mu, var=s ** 2, N=n, Tref=Tref, data_mean=data_mean, correlation=label)
-        else:
-            if n == 1:
-                kin.uncertainty = RateUncertainty(mu=0.0, var=(np.log(fmax) / 2.0) ** 2, N=1, Tref=Tref, data_mean=data_mean, correlation=label)
+                        arr().fit_to_reactions(np.delete(rs, i), recipe=recipe)
+                .to_arrhenius(rxn.get_enthalpy_of_reaction(Tref))
+                .get_rate_coefficient(T=Tref) / rxn.get_rate_coefficient(T=Tref)
+            ) for i, rxn in enumerate(rs)
+        ])  # 1) fit to set of reactions without the current reaction (k)  2) compute log(kfit/kactual) at Tref
             else:
-                if isinstance(rs[0].kinetics, Arrhenius):
-                    dlnks = np.array([
-                        np.log(
-                            arr().fit_to_reactions(rs[list(set(range(len(rs))) - {i})], recipe=recipe)
-                    .to_arrhenius(rxn.get_enthalpy_of_reaction(Tref))
-                    .get_rate_coefficient(T=Tref) / rxn.get_rate_coefficient(T=Tref)
-                ) for i, rxn in enumerate(rs)
-            ])  # 1) fit to set of reactions without the current reaction (k)  2) compute log(kfit/kactual) at Tref
-                else:
-                    dlnks = np.array([
-                        np.log(
-                            arr().fit_to_reactions(rs[list(set(range(len(rs))) - {i})], recipe=recipe)
-                            .to_arrhenius_charge_transfer(rxn.get_enthalpy_of_reaction(Tref))
-                            .get_rate_coefficient(T=Tref) / rxn.get_rate_coefficient(T=Tref)
-                        ) for i, rxn in enumerate(rs)
-                    ])  # 1) fit to set of reactions without the current reaction (k)  2) compute log(kfit/kactual) at Tref
-                varis = (np.array([rank_accuracy_map[rxn.rank].value_si for rxn in rs]) / (2.0 * 8.314 * Tref)) ** 2
-                # weighted average calculations
-                ws = 1.0 / varis
-                V1 = ws.sum()
-                V2 = (ws ** 2).sum()
-                mu = np.dot(ws, dlnks) / V1
-                s = np.sqrt(np.dot(ws, (dlnks - mu) ** 2) / (V1 - V2 / V1))
-                kin.uncertainty = RateUncertainty(mu=mu, var=s ** 2, N=n, Tref=Tref, data_mean=data_mean, correlation=label)
-        
-        #site solute parameters
-        site_datas = [get_site_solute_data(rxn) for rxn in rxns]
-        site_datas = [sdata for sdata in site_datas if sdata is not None]
-        if len(site_datas) > 0:
-            site_data = SoluteTSData()
-            for sdata in site_datas:
-                site_data += sdata
-            site_data = site_data * (1.0/len(site_datas))
-            kin.solute = site_data
-        return kin
-    else:
-        return None
+                dlnks = np.array([
+                    np.log(
+                        arr().fit_to_reactions(np.delete(rs, i), recipe=recipe)
+                        .to_arrhenius_charge_transfer(rxn.get_enthalpy_of_reaction(Tref))
+                        .get_rate_coefficient(T=Tref) / rxn.get_rate_coefficient(T=Tref)
+                    ) for i, rxn in enumerate(rs)
+                ])  # 1) fit to set of reactions without the current reaction (k)  2) compute log(kfit/kactual) at Tref
+            varis = (np.array([rank_accuracy_map[rxn.rank].value_si for rxn in rs]) / (2.0 * constants.R * Tref)) ** 2
+            # weighted average calculations
+            ws = 1.0 / varis
+            V1 = ws.sum()
+            V2 = (ws ** 2).sum()
+            mu = np.dot(ws, dlnks) / V1
+            s = np.sqrt(np.dot(ws, (dlnks - mu) ** 2) / (V1 - V2 / V1))
+            kin.uncertainty = RateUncertainty(mu=mu, var=s ** 2, N=n, Tref=Tref, data_mean=data_mean, correlation=label)
+    
+    #site solute parameters
+    site_datas = [get_site_solute_data(rxn) for rxn in rxns]
+    site_datas = [sdata for sdata in site_datas if sdata is not None]
+    if len(site_datas) > 0:
+        site_data = SoluteTSData()
+        for sdata in site_datas:
+            site_data += sdata
+        site_data = site_data * (1.0/len(site_datas))
+        kin.solute = site_data
+    return kin
+
 
 
 def _spawn_tree_process(family, template_rxn_map, obj, T, nprocs, depth, min_splitable_entry_num, min_rxns_to_spawn, extension_iter_max, extension_iter_item_cap):
@@ -4772,7 +4789,7 @@ def average_kinetics(kinetics_list):
             beta=(beta,"1/m"),
             wr=(wr * 0.001, "kJ/mol"),
             wp=(wp * 0.001, "kJ/mol"),
-            comment="Averaged from {} reactions.".format(len(kinetics_list)),
+            comment=f"Averaged from {len(kinetics_list)} rate expressions.",
             )
     elif isinstance(kinetics, SurfaceChargeTransfer):
         averaged_kinetics = SurfaceChargeTransfer(
@@ -4782,6 +4799,7 @@ def average_kinetics(kinetics_list):
             alpha=alpha,
             V0=(V0,'V'),
             Ea=(Ea * 0.001, "kJ/mol"),
+            comment=f"Averaged from {len(kinetics_list)} rate expressions.",
             )
     elif isinstance(kinetics, ArrheniusChargeTransfer):
         averaged_kinetics = ArrheniusChargeTransfer(
@@ -4791,6 +4809,7 @@ def average_kinetics(kinetics_list):
             alpha=alpha,
             V0=(V0,'V'),
             Ea=(Ea * 0.001, "kJ/mol"),
+            comment=f"Averaged from {len(kinetics_list)} rate expressions.",
             )
     else:
         averaged_kinetics = Arrhenius(
diff --git a/rmgpy/display.py b/rmgpy/display.py
@@ -29,18 +29,18 @@
 
 """
 This module contains only a display() function, which, if you are running in
-the IPython --pylab mode, will render things inline in pretty SVG or PNG graphics.
-If you are NOT running in IPython --pylab mode, it will do nothing.
+a Jupyter notebook or IPython session, will render things inline in pretty SVG or PNG graphics.
+If you are NOT running in such an environment, it will do nothing.
 """
 
 try:
-    from IPython.core.interactiveshell import InteractiveShell
+    from IPython import get_ipython
+    _ipython = get_ipython()
 except ImportError:
+    _ipython = None
+
+if _ipython is not None:
+    from IPython.display import display
+else:
     def display(obj):
         pass
-else:
-    if InteractiveShell.initialized():
-        from IPython.core.display import display
-    else:
-        def display(obj):
-            pass
diff --git a/rmgpy/kinetics/arrhenius.pyx b/rmgpy/kinetics/arrhenius.pyx