env_run: route adapter subprocesses through a conda-family launcher for env isolation

calvinp0 · calvinp0 · commit cb3e4ba304d4 · 2026-05-02T18:50:15.000+03:00
ARC runs inside `arc_env`. The AutoTST, GCN, and TorchANI adapters
shell out to scripts that live in their *own* envs (`tst_env`,
`ts_gcn`, `tani_env`). The previous flow was:

    subprocess.run("source ~/.bashrc; <target_env_python> <script>",
                   shell=True)

which invokes the target env's interpreter directly without
deactivating arc_env first. ARC's exported activation vars
(`BABEL_LIBDIR`, `LD_LIBRARY_PATH`, `CONDA_PREFIX`, ...) stay bound to
arc_env's paths in the child, and shared libraries in the target env
then resolve plugins against the wrong tree — producing ABI-mismatch
crashes (most visibly OpenBabel plugin loading in tst_env).

Add `arc/job/env_run.py` exposing `run_in_conda_env(python, script,
*args)`. It:

- Derives the env *prefix* from the configured python path
  (`<prefix>/bin/python`). Using a prefix path rather than `-n <name>`
  avoids assuming the env lives under a literal `envs/` segment, so
  `CONDA_ENVS_PATH` overrides and bare-prefix layouts (e.g.
  `/scratch/conda_envs/<env>/bin/python`) work without special-casing.
- Detects an available launcher in preference order: the active one
  per `CONDA_EXE` / `MAMBA_EXE`, then `conda` → `mamba` → `micromamba`
  on PATH.
- Selects the right stdio flag per launcher: conda/mamba need
  `--no-capture-output` (so they forward child stdio rather than
  buffering internally); micromamba streams by default and rejects
  the flag, so it is omitted. Decided by launcher basename so symlinks
  and `MAMBA_EXE`-points-at-micromamba setups still get the right
  behavior.
- Captures stdout and stderr (`capture_output=True, text=True`) and
  logs centrally — `logger.warning` on non-zero return with the full
  argv and both streams, `logger.debug` on success — so call sites
  don't each re-implement capture and error reporting. The captured
  streams remain on the returned `CompletedProcess` for callers that
  want to inspect them.
- Passes args as a list — no shell, no quoting concerns.

Routing through `<launcher> run -p <prefix>` triggers the target env's
`activate.d` hooks, which is what rebinds the leaked activation vars
to the correct paths.

Wire the three callers (`torch_ani.py`, `ts/autotst_ts.py`,
`ts/gcn_ts.py`) over to the helper and drop the now-unused
`subprocess` imports. Their `if returncode: logger.warning(...)`
blocks stay — they add job/species context the helper doesn't have.

Add `from __future__ import annotations` to `autotst_script.py` and
`tani_script.py`. Both use PEP 604 union syntax (`X | Y`) for parity
with the rest of ARC, but run under tst_env (Python 3.9) and tani_env
(version varies); the future-import defers annotation evaluation so
they import cleanly on any Python ≥3.7.

Tests in `env_run_test.py` cover prefix derivation for both standard
and `CONDA_ENVS_PATH` layouts, launcher-flag selection by basename,
the `CONDA_EXE`/`MAMBA_EXE` → PATH-lookup detection chain, argv
construction, and the success/failure logging branches.
diff --git a/arc/job/adapters/scripts/autotst_script.py b/arc/job/adapters/scripts/autotst_script.py
@@ -6,6 +6,11 @@
 should be run under the tst_env.
 """
 
+# tst_env is Python 3.9; this script uses PEP 604 union syntax
+# (``str | None``) for parity with the rest of ARC. The future-import
+# defers annotation evaluation so 3.9 doesn't choke at def-time.
+from __future__ import annotations
+
 import argparse
 import numpy as np
 import os
diff --git a/arc/job/adapters/scripts/tani_script.py b/arc/job/adapters/scripts/tani_script.py
@@ -6,6 +6,11 @@
 should be run under the tani environment.
 """
 
+# tani_env's Python may predate PEP 604 (``X | Y`` unions); this
+# future-import defers annotation evaluation so the script imports on
+# any Python ≥3.7 regardless of the env's interpreter version.
+from __future__ import annotations
+
 import argparse
 import os
 import yaml
diff --git a/arc/job/adapters/torch_ani.py b/arc/job/adapters/torch_ani.py
@@ -9,12 +9,12 @@
 import datetime
 import os
 from typing import TYPE_CHECKING
-import subprocess
 
 from arc.common import ARC_PATH, get_logger, is_xyz_linear, save_yaml_file, read_yaml_file
 from arc.imports import settings
 from arc.job.adapter import JobAdapter
 from arc.job.adapters.common import _initialize_adapter
+from arc.job.env_run import run_in_conda_env
 from arc.job.factory import register_job_adapter
 from arc.level import Level
 from arc.settings.settings import tani_default_options_dict
@@ -255,11 +255,12 @@ def execute_incore(self):
             return
 
         self.write_input_file(tani_default_options_dict)
-        commands = ['source ~/.bashrc',
-                   f'{TANI_PYTHON} {TANI_SCRIPT_PATH} '
-                   f'--yml_path {self.local_path}']
-        command = '; '.join(commands)
-        output = subprocess.run(command, shell=True, executable='/bin/bash')
+        # Routed via run_in_conda_env so arc_env's activation vars don't
+        # leak into the child (see arc/job/env_run.py).
+        output = run_in_conda_env(
+            TANI_PYTHON, TANI_SCRIPT_PATH,
+            '--yml_path', self.local_path,
+        )
         if output.returncode:
             logger.warning(f'Torch ANI subprocess ran and did not '
                            f'give a successful return code for {self.job_name}.\n'
diff --git a/arc/job/adapters/ts/autotst_ts.py b/arc/job/adapters/ts/autotst_ts.py
@@ -6,13 +6,13 @@
 
 import datetime
 import os
-import subprocess
 from typing import TYPE_CHECKING
 
 from arc.common import almost_equal_coords, ARC_PATH, get_logger, read_yaml_file
 from arc.imports import settings
 from arc.job.adapter import JobAdapter
 from arc.job.adapters.common import _initialize_adapter
+from arc.job.env_run import run_in_conda_env
 from arc.job.factory import register_job_adapter
 from arc.plotter import save_geo
 from arc.reaction import ARCReaction
@@ -237,14 +237,16 @@ def execute_incore(self):
 
                 i = 0
                 for reaction_label, direction in zip([reaction_label_fwd, reaction_label_rev], ['F', 'R']):
-                    # run AutoTST as a subprocess in the desired direction
+                    # Run AutoTST as a subprocess in the desired direction.
+                    # run_in_conda_env keeps arc_env's activation vars
+                    # (BABEL_LIBDIR, LD_LIBRARY_PATH, ...) from leaking into the
+                    # child and corrupting openbabel plugin loading
+                    # (see arc/job/env_run.py).
                     script_path = os.path.join(ARC_PATH, 'arc', 'job', 'adapters', 'scripts', 'autotst_script.py')
-                    commands = ['source ~/.bashrc', f'"{AUTOTST_PYTHON}" "{script_path}" "{reaction_label}" "{self.output_path}"']
-                    command = '; '.join(commands)
 
                     tic = datetime.datetime.now()
 
-                    output = subprocess.run(command, shell=True, executable='/bin/bash')
+                    output = run_in_conda_env(AUTOTST_PYTHON, script_path, reaction_label, self.output_path)
 
                     tok = datetime.datetime.now() - tic
 
diff --git a/arc/job/adapters/ts/gcn_ts.py b/arc/job/adapters/ts/gcn_ts.py
@@ -9,7 +9,6 @@
 
 import datetime
 import os
-import subprocess
 from typing import TYPE_CHECKING
 
 from rdkit import Chem
@@ -18,6 +17,7 @@
 from arc.imports import settings
 from arc.job.adapter import JobAdapter
 from arc.job.adapters.common import _initialize_adapter
+from arc.job.env_run import run_in_conda_env
 from arc.job.factory import register_job_adapter
 from arc.plotter import save_geo
 from arc.species.converter import rdkit_conf_from_mol, str_to_xyz
@@ -366,13 +366,14 @@ def run_subprocess_locally(direction: str,
                   index=len(ts_species.ts_guesses),
                   )
     tsg.tic()
-    commands = ['source ~/.bashrc',
-                f'{TS_GCN_PYTHON} {GCN_SCRIPT_PATH} '
-                f'--r_sdf_path {product_path} '
-                f'--p_sdf_path {reactant_path} '
-                f'--ts_xyz_path {ts_path}']
-    command = '; '.join(commands)
-    output = subprocess.run(command, shell=True, executable='/bin/bash')
+    # Routed via run_in_conda_env so arc_env's activation vars don't
+    # leak into the child (see arc/job/env_run.py).
+    output = run_in_conda_env(
+        TS_GCN_PYTHON, GCN_SCRIPT_PATH,
+        '--r_sdf_path', product_path,
+        '--p_sdf_path', reactant_path,
+        '--ts_xyz_path', ts_path,
+    )
     if output.returncode:
         logger.warning(f'GCN subprocess ran in the reverse direction did not '
                        f'give a successful return code for {ts_species}.\n'
diff --git a/arc/job/env_run.py b/arc/job/env_run.py
@@ -0,0 +1,144 @@
+"""Invoke a script under a sibling conda/mamba env, isolated from ARC's env.
+
+ARC runs inside ``arc_env``. Several adapters (AutoTST, GCN, TorchANI)
+shell out to scripts that live in their *own* envs (``tst_env``,
+``ts_gcn``, ``tani_env``). Running the target env's ``python``
+binary directly leaves ARC's exported activation vars (``BABEL_LIBDIR``,
+``LD_LIBRARY_PATH``, ``CONDA_PREFIX``, ...) bound to ``arc_env``'s
+paths in the child, which causes ABI-mismatch crashes when shared
+libraries in the child resolve plugins against the wrong env's tree.
+
+Routing through a launcher's ``run`` subcommand makes the launcher
+deactivate the caller env and re-activate the target, so the target
+env's own ``activate.d`` hooks fire and bind those vars to its paths.
+
+Three launchers are supported, in preference order:
+
+1. ``conda`` — needs ``--no-capture-output`` to avoid buffering child
+   stdio.
+2. ``mamba`` — same parser as conda for ``run``; also needs
+   ``--no-capture-output``.
+3. ``micromamba`` — independent C++ reimplementation; streams stdio by
+   default and **rejects** ``--no-capture-output``, so the flag must be
+   omitted.
+
+Buffering matters: without the right flag, conda/mamba hold the child's
+stdout until exit, hiding tracebacks and progress.
+
+The launcher is detected at call time, with the active one (per
+``CONDA_EXE`` / ``MAMBA_EXE``) preferred when available.
+"""
+
+import os
+import shutil
+import subprocess
+from pathlib import Path
+
+from arc.common import get_logger
+
+logger = get_logger()
+
+
+def env_prefix_from_python(python_executable: str) -> str:
+    """Return the conda env prefix that owns ``python_executable``.
+
+    ARC's settings give target interpreters as full paths of the form
+    ``<prefix>/bin/python``; the prefix handed to ``<launcher> run -p``
+    is the directory two levels above the binary.
+
+    Working from the path instead of ``-n <name>`` means envs outside a
+    literal ``envs/`` directory (``CONDA_ENVS_PATH`` overrides, bare
+    prefixes such as ``/scratch/conda_envs/<env>``) need no special
+    handling.
+
+    The check is purely lexical and never calls ``Path.resolve()``:
+    ``<prefix>/bin/python`` is normally a symlink to ``python3.X``, so
+    resolving first would change the basename and fail the name test.
+
+    Raises:
+        ValueError: if the path does not end in ``bin/python``.
+    """
+    interpreter = Path(python_executable)
+    bin_dir = interpreter.parent
+    if (interpreter.name, bin_dir.name) != ("python", "bin"):
+        raise ValueError(
+            f"Cannot derive an env prefix from {python_executable!r}; "
+            "expected a path of the form '<prefix>/bin/python'."
+        )
+    return str(bin_dir.parent)
+
+
+def _run_flags_for(launcher_path: str) -> list[str]:
+    """Pick the ``run`` flags that make this launcher stream child stdio.
+
+    Keyed on the executable's basename, not on the env var that led us
+    to it, so a ``MAMBA_EXE`` that points at micromamba is still
+    handled as micromamba: it gets no flag, while every other launcher
+    gets ``--no-capture-output``.
+    """
+    basename = Path(launcher_path).name
+    needs_flag = basename != "micromamba"
+    return ["--no-capture-output"] if needs_flag else []
+
+
+def _detect_launcher() -> tuple[str, list[str]]:
+    """Locate a conda-family launcher and the ``run`` flags it needs.
+
+    An existing file named by ``CONDA_EXE`` or ``MAMBA_EXE`` (checked
+    in that order) wins; otherwise the first of conda, mamba,
+    micromamba found on PATH is used. Returns ``(path, flags)`` and
+    raises ``FileNotFoundError`` when nothing is available.
+    """
+    from_env = (os.environ.get(var) for var in ("CONDA_EXE", "MAMBA_EXE"))
+    active = next((p for p in from_env if p and os.path.isfile(p)), None)
+    if active is None:
+        on_path = (shutil.which(n) for n in ("conda", "mamba", "micromamba"))
+        active = next((hit for hit in on_path if hit), None)
+    if active is None:
+        raise FileNotFoundError(
+            "No conda-family launcher (conda / mamba / micromamba) found on "
+            "PATH. ARC's cross-env adapters (AutoTST/GCN/TorchANI) need one "
+            "of these to launch their subprocess scripts in isolated envs."
+        )
+    return active, _run_flags_for(active)
+
+
+def run_in_conda_env(
+    python_executable: str,
+    script_path: str,
+    *script_args: str,
+    check: bool = False,
+) -> subprocess.CompletedProcess:
+    """Run ``python script_path *script_args`` inside the env that owns
+    ``python_executable``, isolated from ARC's process env.
+
+    stdout/stderr are captured as text, logged here (debug on success,
+    warning with return code and both streams on failure), and left on
+    the returned :class:`subprocess.CompletedProcess`. Args are passed
+    as a list, so there are no shell quoting concerns.
+
+    Raises ``ValueError`` / ``FileNotFoundError`` when no env prefix or
+    launcher can be determined, and — only with ``check=True`` —
+    ``CalledProcessError`` on non-zero exit, after the warning is logged.
+    """
+    env_prefix = env_prefix_from_python(python_executable)
+    launcher, extra_flags = _detect_launcher()
+    argv = [launcher, "run", *extra_flags, "-p", env_prefix,
+            "python", script_path, *script_args]
+    # Always run unchecked: check=True would raise inside subprocess.run,
+    # before the failure could be logged below.
+    result = subprocess.run(argv, check=False, capture_output=True, text=True)
+    if result.returncode:
+        logger.warning(
+            "env-run: %s exited with %d\ncmd: %s\nstdout:\n%s\nstderr:\n%s",
+            script_path, result.returncode, " ".join(argv),
+            result.stdout, result.stderr,
+        )
+        if check:
+            result.check_returncode()
+    else:
+        logger.debug(
+            "env-run: %s exited 0\ncmd: %s\nstdout:\n%s\nstderr:\n%s",
+            script_path, " ".join(argv), result.stdout, result.stderr,
+        )
+    return result
diff --git a/arc/job/env_run_test.py b/arc/job/env_run_test.py