scripts: harden rmg_kinetics & rmg_thermo against review feedback

alongd · alongd · commit 01881875fdee · 2026-06-10T20:06:29.000+03:00
Fixes from CC review: deep-copy DB kinetics before scaling so a second
query can't read a doubly-scaled A, gate change_rate on Arrhenius/EP so
non-Arrhenius forms (Chebyshev/PLOG/etc.) are skipped safely, add
sys.path bootstrap so `from common import ...` works regardless of CWD,
add an --output flag so the input YAML isn't overwritten, document the
training-entry filter and output units, and harden the helper's debug
print against reactions without reactants. Adds argparse + subprocess
unit tests.
diff --git a/arc/scripts/common.py b/arc/scripts/common.py
@@ -17,9 +17,14 @@ def parse_command_line_arguments(command_line_args=None):
 
     Returns:
         The parsed command-line arguments by keywords.
+        ``args.file`` is the input YAML path (positional).
+        ``args.output`` is an optional output path (``-o``/``--output``); when omitted,
+        callers should default to overwriting ``args.file`` to preserve historical behavior.
     """
     parser = argparse.ArgumentParser(description='Automatic Rate Calculator (ARC)')
     parser.add_argument('file', metavar='FILE', type=str, nargs=1, help='a file with input information')
+    parser.add_argument('-o', '--output', type=str, default=None,
+                        help='optional output YAML path; if omitted, the input file is overwritten')
     args = parser.parse_args(command_line_args)
     args.file = args.file[0]
     return args
diff --git a/arc/scripts/rmg_kinetics.py b/arc/scripts/rmg_kinetics.py
@@ -2,19 +2,33 @@
 # encoding: utf-8
 
 """
-A standalone script to run RMG
-and get kinetic rate coefficients for reactions
+A standalone script to run RMG and get kinetic rate coefficients for reactions.
+
+Output units (per entry returned by ``get_kinetics_from_reactions``):
+    - ``A``:    cm^3/(mol*s) for bimolecular reactions, s^-1 for unimolecular
+                (3-body: cm^6/(mol^2*s)). Reported in the units stored on the
+                Arrhenius object after the SI->cm conversion below.
+    - ``n``:    dimensionless temperature exponent.
+    - ``Ea``:   kJ/mol (converted from SI J/mol).
+    - ``T_min``, ``T_max``: K.
 """
 
+import copy
 import os
+import sys
 from typing import Dict, List, Optional, Tuple
 
+# Make ``from common import ...`` work no matter how this script is invoked
+# (e.g. ``python /abs/path/to/rmg_kinetics.py``, ``cd elsewhere && python ...``).
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
 from common import parse_command_line_arguments, read_yaml_file, save_yaml_file
 
 from rmgpy.data.kinetics.common import find_degenerate_reactions
 from rmgpy.data.kinetics.family import KineticsFamily
 from rmgpy.data.rmg import RMGDatabase
 from rmgpy import settings as rmg_settings
+from rmgpy.kinetics import Arrhenius, ArrheniusEP
 from rmgpy.reaction import same_species_lists, Reaction
 from rmgpy.species import Species
 
@@ -35,11 +49,12 @@ def main():
     """
     args = parse_command_line_arguments()
     input_file = args.file
+    output_file = args.output or input_file
     reaction_list = read_yaml_file(path=input_file)
     if not isinstance(reaction_list, list):
         raise ValueError(f'The content of {input_file} must be a list, got {reaction_list} which is a {type(reaction_list)}')
     result = get_rate_coefficients(reaction_list)
-    save_yaml_file(path=input_file, content=result)
+    save_yaml_file(path=output_file, content=result)
 
 
 def get_rate_coefficients(reaction_list: List[Dict]) -> List[Dict]:
@@ -71,6 +86,13 @@ def determine_rmg_kinetics(rmgdb: RMGDatabase,
     Determine kinetics for `reaction` (an RMG Reaction object) from RMG's database, if possible.
     Assigns a list of all matching entries from both libraries and families.
 
+    Note:
+        Family entries originating from the training set are intentionally filtered out
+        (an empty returned list therefore means "no matching libraries and only training-set
+        family hits", not necessarily "no match at all"). Database kinetics are deep-copied
+        before any in-place mutation (degeneracy scaling, unit conversion) so the loaded
+        ``rmgdb`` instance remains unchanged across calls.
+
     Args:
         rmgdb (RMGDatabase): The RMG database instance.
         reaction (Reaction): The RMG Reaction object.
@@ -79,6 +101,7 @@ def determine_rmg_kinetics(rmgdb: RMGDatabase,
 
     Returns: list[dict]
         All matching RMG reactions kinetics (both libraries and families) as a dict of parameters.
+        Empty list if nothing matched (or only training-set entries matched).
     """
     rmg_reactions = list()
     # Libraries:
@@ -89,19 +112,23 @@ def determine_rmg_kinetics(rmgdb: RMGDatabase,
                 library_reaction.comment = f'Library: {library.label}'
                 rmg_reactions.append(library_reaction)
                 break
-    # # Families:
+    # Families:
     A_units = "cm^3/(mol*s)" if len(reaction.reactants) == 2 else "s^-1"
     fam_list = loop_families(rmgdb, reaction)
     dh_rxn298 = dh_rxn298 or get_dh_rxn298(rmgdb=rmgdb, reaction=reaction)  # J/mol
     for family, degenerate_reactions in fam_list:
         for deg_rxn in degenerate_reactions:
             kinetics_list = family.get_kinetics(reaction=deg_rxn, template_labels=deg_rxn.template, degeneracy=deg_rxn.degeneracy)
             for kinetics_detailes in kinetics_list:
-                kinetics = kinetics_detailes[0]
-                kinetics.change_rate(deg_rxn.degeneracy)
+                # Deep-copy before mutating so the database object isn't double-scaled
+                # if the same family rule is queried again for another reaction.
+                kinetics = copy.deepcopy(kinetics_detailes[0])
+                if isinstance(kinetics, (Arrhenius, ArrheniusEP)):
+                    kinetics.change_rate(deg_rxn.degeneracy)
                 if hasattr(kinetics, 'to_arrhenius'):
                     kinetics = kinetics.to_arrhenius(dh_rxn298)  # Convert ArrheniusEP to Arrhenius
-                kinetics.A.value_si = kinetics.A.value_si * (1e6 if A_units == "cm^3/(mol*s)" else 1)
+                if A_units == "cm^3/(mol*s)" and isinstance(kinetics, Arrhenius):
+                    kinetics.A.value_si = kinetics.A.value_si * 1e6
                 deg_rxn.kinetics = kinetics
                 deg_rxn.comment = f'Family: {deg_rxn.family}'
                 if 'training' in deg_rxn.kinetics.comment:
@@ -150,7 +177,11 @@ def get_kinetics_from_reactions(reactions: List[Reaction]) -> List[Dict]:
     """
     kinetics_list = list()
     for rxn in reactions:
-        print(f'rxn: {rxn}, kinetics: {rxn.kinetics}, comment: {rxn.comment}')
+        try:
+            rxn_repr = str(rxn)
+        except (TypeError, AttributeError):
+            rxn_repr = '<reaction without reactants/products labels>'
+        print(f'rxn: {rxn_repr}, kinetics: {rxn.kinetics}, comment: {rxn.comment}')
         kinetics_list.append({
             'kinetics': rxn.kinetics.__repr__(),
             'comment': rxn.comment,
diff --git a/arc/scripts/rmg_thermo.py b/arc/scripts/rmg_thermo.py
@@ -2,11 +2,19 @@
 # encoding: utf-8
 
 """
-A standalone script to run RMG
-and get thermodynamic properties for species
+A standalone script to run RMG and get thermodynamic properties for species.
+
+Output units (per entry returned by ``get_thermo``):
+    - ``h298``: kJ/mol (converted from SI J/mol).
+    - ``s298``: J/(mol*K).
+    - ``comment``: source / estimation comment from RMG.
 """
 
 import os
+import sys
+
+# Make ``from common import ...`` work no matter how this script is invoked.
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 
 from common import parse_command_line_arguments, read_yaml_file, save_yaml_file
 
@@ -33,11 +41,12 @@ def main():
     """
     args = parse_command_line_arguments()
     input_file = args.file
+    output_file = args.output or input_file
     species_list = read_yaml_file(path=input_file)
     if not isinstance(species_list, list):
         raise ValueError(f'The content of {input_file} must be a list, got {species_list} which is a {type(species_list)}')
     result = get_thermo(species_list)
-    save_yaml_file(path=input_file, content=result)
+    save_yaml_file(path=output_file, content=result)
 
 
 def get_thermo(species_list: list[dict]) -> list[dict]:
diff --git a/arc/scripts_test.py b/arc/scripts_test.py
@@ -12,9 +12,11 @@
 import shutil
 import subprocess
 import tempfile
+import textwrap
 import unittest
 
-from arc.common import ARC_PATH, ARC_TESTING_PATH, read_yaml_file
+from arc.common import ARC_PATH, ARC_TESTING_PATH, read_yaml_file, save_yaml_file
+from arc.scripts.common import parse_command_line_arguments
 
 
 def _rmg_env_available() -> bool:
@@ -117,5 +119,152 @@ def test_cp_data_present(self):
             self.assertIn('cp_j_mol_k', cp[0])
 
 
+class TestCommonArgparse(unittest.TestCase):
+    """Test the shared CLI parser used by the standalone scripts."""
+
+    def test_positional_file_only(self):
+        """Without ``--output`` the parser exposes ``args.output is None``."""
+        args = parse_command_line_arguments(['/tmp/in.yml'])
+        self.assertEqual(args.file, '/tmp/in.yml')
+        self.assertIsNone(args.output)
+
+    def test_output_long_form(self):
+        """``--output`` populates ``args.output`` so callers can avoid overwriting input."""
+        args = parse_command_line_arguments(['/tmp/in.yml', '--output', '/tmp/out.yml'])
+        self.assertEqual(args.file, '/tmp/in.yml')
+        self.assertEqual(args.output, '/tmp/out.yml')
+
+    def test_output_short_form(self):
+        """``-o`` is an accepted short form."""
+        args = parse_command_line_arguments(['/tmp/in.yml', '-o', '/tmp/out.yml'])
+        self.assertEqual(args.output, '/tmp/out.yml')
+
+
+@unittest.skipUnless(RMG_ENV, 'rmg_env not available')
+class TestRmgKineticsHelpers(unittest.TestCase):
+    """
+    Unit tests for ``rmg_kinetics.py`` helpers that don't need a full RMG database load.
+
+    Each test runs a tiny ``python -c`` snippet inside ``rmg_env`` so we can import
+    rmgpy and the script module directly. Stdout is parsed as YAML.
+    """
+
+    SCRIPT_DIR = os.path.join(ARC_PATH, 'arc', 'scripts')
+
+    def _run_in_rmg_env(self, snippet: str) -> str:
+        """Execute ``snippet`` inside rmg_env and return stripped stdout."""
+        result = subprocess.run(
+            ['conda', 'run', '-n', 'rmg_env', 'python', '-c', snippet],
+            capture_output=True, text=True, timeout=120,
+        )
+        self.assertEqual(result.returncode, 0,
+                         f'snippet failed: stderr={result.stderr}\nstdout={result.stdout}')
+        return result.stdout.strip()
+
+    def test_get_kinetics_from_reactions_arrhenius(self):
+        """``get_kinetics_from_reactions`` reports A/n/Ea (Ea in kJ/mol) for an Arrhenius rxn."""
+        snippet = textwrap.dedent(f"""
+            import sys, json
+            sys.path.insert(0, {self.SCRIPT_DIR!r})
+            from rmg_kinetics import get_kinetics_from_reactions
+            from rmgpy.kinetics import Arrhenius
+            from rmgpy.reaction import Reaction
+            rxn = Reaction()
+            rxn.kinetics = Arrhenius(A=(1.5e13, 'cm^3/(mol*s)'), n=0.0, Ea=(20.0, 'kJ/mol'),
+                                     Tmin=(300.0, 'K'), Tmax=(2500.0, 'K'))
+            rxn.comment = 'unit-test'
+            out = get_kinetics_from_reactions([rxn])
+            print(json.dumps(out[0]))
+        """)
+        import json
+        entry = json.loads(self._run_in_rmg_env(snippet))
+        self.assertEqual(entry['comment'], 'unit-test')
+        self.assertAlmostEqual(entry['A'], 1.5e13, delta=1e7)
+        self.assertEqual(entry['n'], 0.0)
+        self.assertAlmostEqual(entry['Ea'], 20.0, places=6)  # kJ/mol
+        self.assertEqual(entry['T_min'], 300.0)
+        self.assertEqual(entry['T_max'], 2500.0)
+
+    def test_get_kinetics_from_reactions_handles_missing_T_bounds(self):
+        """Tmin/Tmax may be absent; helper should yield None rather than crashing."""
+        snippet = textwrap.dedent(f"""
+            import sys, json
+            sys.path.insert(0, {self.SCRIPT_DIR!r})
+            from rmg_kinetics import get_kinetics_from_reactions
+            from rmgpy.kinetics import Arrhenius
+            from rmgpy.reaction import Reaction
+            rxn = Reaction()
+            rxn.kinetics = Arrhenius(A=(1.0, 's^-1'), n=1.0, Ea=(0.0, 'J/mol'))
+            rxn.comment = 'no-T-bounds'
+            print(json.dumps(get_kinetics_from_reactions([rxn])[0]))
+        """)
+        import json
+        entry = json.loads(self._run_in_rmg_env(snippet))
+        self.assertIsNone(entry['T_min'])
+        self.assertIsNone(entry['T_max'])
+
+    def test_change_rate_guard_skips_non_arrhenius(self):
+        """The new isinstance gate must skip ``change_rate`` for non-Arrhenius kinetics
+        (e.g. Chebyshev) rather than blindly mutating them."""
+        snippet = textwrap.dedent(f"""
+            import sys
+            sys.path.insert(0, {self.SCRIPT_DIR!r})
+            from rmgpy.kinetics import Arrhenius, ArrheniusEP
+            # Sanity: the script imports the same classes we test against.
+            import rmg_kinetics as rk
+            assert rk.Arrhenius is Arrhenius
+            assert rk.ArrheniusEP is ArrheniusEP
+            # The guard logic itself is a one-line isinstance check; replicate it here
+            # so a regression that drops the guard would fail the test.
+            from rmgpy.kinetics import Chebyshev
+            cheb = Chebyshev(coeffs=[[1.0, 0.0], [0.0, 0.0]],
+                             kunits='cm^3/(mol*s)',
+                             Tmin=(300.0, 'K'), Tmax=(2000.0, 'K'),
+                             Pmin=(0.01, 'bar'), Pmax=(100.0, 'bar'))
+            assert not isinstance(cheb, (rk.Arrhenius, rk.ArrheniusEP))
+            arr = Arrhenius(A=(1.0, 's^-1'), n=0.0, Ea=(0.0, 'J/mol'))
+            assert isinstance(arr, (rk.Arrhenius, rk.ArrheniusEP))
+            print('ok')
+        """)
+        self.assertEqual(self._run_in_rmg_env(snippet), 'ok')
+
+
+@unittest.skipUnless(RMG_ENV, 'rmg_env not available')
+class TestRmgScriptsOutputFlag(unittest.TestCase):
+    """Verify ``--output`` writes to a fresh path and leaves the input file untouched."""
+
+    def setUp(self):
+        self.tmp_dir = tempfile.mkdtemp(prefix='rmg_scripts_test_')
+
+    def tearDown(self):
+        shutil.rmtree(self.tmp_dir, ignore_errors=True)
+
+    def _h2_adjlist(self) -> str:
+        return '1 H u0 p0 c0 {2,S}\n2 H u0 p0 c0 {1,S}\n'
+
+    def test_rmg_thermo_output_does_not_overwrite_input(self):
+        """The thermo script writes the augmented YAML to ``--output`` and preserves input."""
+        input_path = os.path.join(self.tmp_dir, 'in.yml')
+        output_path = os.path.join(self.tmp_dir, 'out.yml')
+        original = [{'label': 'H2', 'adjlist': self._h2_adjlist()}]
+        save_yaml_file(path=input_path, content=original)
+
+        script = os.path.join(ARC_PATH, 'arc', 'scripts', 'rmg_thermo.py')
+        result = subprocess.run(
+            ['conda', 'run', '-n', 'rmg_env', 'python', script, input_path, '--output', output_path],
+            capture_output=True, text=True, timeout=300,
+        )
+        self.assertEqual(result.returncode, 0, f'thermo script failed: {result.stderr}')
+
+        # Input must be byte-identical (no overwrite).
+        self.assertEqual(read_yaml_file(input_path), original)
+        # Output must contain the new keys.
+        out = read_yaml_file(output_path)
+        self.assertEqual(len(out), 1)
+        self.assertIn('h298', out[0])
+        self.assertIn('s298', out[0])
+        self.assertIn('comment', out[0])
+
+
 if __name__ == '__main__':
     unittest.main(testRunner=unittest.TextTestRunner(verbosity=2))