refactor: drive rbfe ligand pairing by SMILES via complex_pdb_for

FridrichMethod · FridrichMethod · commit 0cc52d6bfe19 · 2026-06-05T00:06:32.000-07:00
Replace the regex-based _match_ligand PDB-to-ligand pairing with a single
mapping function complex_pdb_for(name) -> Path defined in the config cell.
Ligand names from the .smi file are now the source of truth: both prep
passes loop over smiles_dict and resolve each name to its complex PDB.
Mapping logic lives in one editable place; drop the now-unused import re.
diff --git a/examples/openfe/rbfe.ipynb b/examples/openfe/rbfe.ipynb
@@ -14,252 +14,47 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "id": "fc97de03",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "60677d08ac4a40fdbf84bfae3f780378",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": []
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "import multiprocessing as mp\n",
-    "import os\n",
-    "import re\n",
-    "from dataclasses import asdict, dataclass\n",
-    "from pathlib import Path\n",
-    "\n",
-    "import MDAnalysis as mda\n",
-    "import mdtraj as md\n",
-    "import openfe\n",
-    "from MDAnalysis.analysis import align\n",
-    "from openfe import (\n",
-    "    AlchemicalNetwork,\n",
-    "    ChemicalSystem,\n",
-    "    ProteinComponent,\n",
-    "    SmallMoleculeComponent,\n",
-    "    SolventComponent,\n",
-    "    Transformation,\n",
-    ")\n",
-    "from openfe.protocols.openmm_rfe import RelativeHybridTopologyProtocol\n",
-    "from openfe.protocols.openmm_septop import SepTopProtocol\n",
-    "from openfe.protocols.openmm_utils.charge_generation import bulk_assign_partial_charges\n",
-    "from openfe.protocols.openmm_utils.omm_settings import OpenFFPartialChargeSettings\n",
-    "from openfe.setup import RBFEAlchemicalNetworkPlanner, RHFEAlchemicalNetworkPlanner\n",
-    "from openfe.setup.atom_mapping import KartografAtomMapper, LomapAtomMapper\n",
-    "from openfe.setup.atom_mapping.lomap_scorers import (\n",
-    "    atomic_number_score,\n",
-    "    default_lomap_score,\n",
-    "    ecr_score,\n",
-    "    mcsr_score,\n",
-    "    mncar_score,\n",
-    ")\n",
-    "from openfe.setup.ligand_network_planning import (\n",
-    "    generate_lomap_network,\n",
-    "    generate_maximal_network,\n",
-    "    generate_minimal_redundant_network,\n",
-    "    generate_minimal_spanning_network,\n",
-    "    generate_network_from_indices,\n",
-    "    generate_network_from_names,\n",
-    "    generate_radial_network,\n",
-    ")\n",
-    "from openfe.utils.atommapping_network_plotting import plot_atommapping_network\n",
-    "from openff.units import unit\n",
-    "from rdkit import Chem\n",
-    "from rdkit.Chem import AllChem\n",
-    "from rdkit.Chem.Descriptors3D import Asphericity\n",
-    "\n",
-    "from mdpp.plots import draw_mols, make_atom_labels_3d, view_mol_3d, view_traj_3d\n",
-    "from mdpp.prep import assign_topology, fix_pdb"
-   ]
+   "outputs": [],
+   "source": "import multiprocessing as mp\nimport os\nfrom dataclasses import asdict, dataclass\nfrom pathlib import Path\n\nimport MDAnalysis as mda\nimport mdtraj as md\nimport openfe\nfrom MDAnalysis.analysis import align\nfrom openfe import (\n    AlchemicalNetwork,\n    ChemicalSystem,\n    ProteinComponent,\n    SmallMoleculeComponent,\n    SolventComponent,\n    Transformation,\n)\nfrom openfe.protocols.openmm_rfe import RelativeHybridTopologyProtocol\nfrom openfe.protocols.openmm_septop import SepTopProtocol\nfrom openfe.protocols.openmm_utils.charge_generation import bulk_assign_partial_charges\nfrom openfe.protocols.openmm_utils.omm_settings import OpenFFPartialChargeSettings\nfrom openfe.setup import RBFEAlchemicalNetworkPlanner, RHFEAlchemicalNetworkPlanner\nfrom openfe.setup.atom_mapping import KartografAtomMapper, LomapAtomMapper\nfrom openfe.setup.atom_mapping.lomap_scorers import (\n    atomic_number_score,\n    default_lomap_score,\n    ecr_score,\n    mcsr_score,\n    mncar_score,\n)\nfrom openfe.setup.ligand_network_planning import (\n    generate_lomap_network,\n    generate_maximal_network,\n    generate_minimal_redundant_network,\n    generate_minimal_spanning_network,\n    generate_network_from_indices,\n    generate_network_from_names,\n    generate_radial_network,\n)\nfrom openfe.utils.atommapping_network_plotting import plot_atommapping_network\nfrom openff.units import unit\nfrom rdkit import Chem\nfrom rdkit.Chem import AllChem\nfrom rdkit.Chem.Descriptors3D import Asphericity\n\nfrom mdpp.plots import draw_mols, make_atom_labels_3d, view_mol_3d, view_traj_3d\nfrom mdpp.prep import assign_topology, fix_pdb"
   },
   {
    "cell_type": "markdown",
    "id": "deb54778",
    "metadata": {},
-   "source": [
-    "## Configuration\n",
-    "\n",
-    "Define input paths (complex PDBs, SMILES file), chain identifiers, and simulation parameters (temperature, pH, lambda windows, production length). All intermediate files are written under a working directory."
-   ]
+   "source": "## Configuration\n\nDefine input paths (complex PDBs, SMILES file), chain identifiers, and simulation parameters (temperature, pH, lambda windows, production length). The `complex_pdb_for` function maps each ligand name -- taken from the SMILES file, which is the single source of truth -- to its complex PDB file; to change how names resolve to PDBs, edit only this function. All intermediate files are written under a working directory."
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "id": "0c51edf0",
    "metadata": {},
    "outputs": [],
-   "source": [
-    "COMPLEX_DIR = Path(\"pdbs\")\n",
-    "SMILES_PATH = Path(\"ligands_amp_fep.smi\")\n",
-    "PROTEIN_CHAIN_ID = \"A\"\n",
-    "LIGAND_CHAIN_ID = \"B\"\n",
-    "\n",
-    "WORKING_DIR = Path(\"tmp\")\n",
-    "WORKING_DIR.mkdir(exist_ok=True)\n",
-    "\n",
-    "ALIGN_DIR = WORKING_DIR / \"align\"\n",
-    "ALIGN_DIR.mkdir(exist_ok=True)\n",
-    "PROTEIN_PATH = WORKING_DIR / \"protein.pdb\"\n",
-    "FIXED_PROTEIN_PATH = WORKING_DIR / \"protein_fixed.pdb\"\n",
-    "LIGAND_DIR = WORKING_DIR / \"ligands\"\n",
-    "LIGAND_DIR.mkdir(exist_ok=True)\n",
-    "CHARGED_LIGAND_DIR = WORKING_DIR / \"charged_ligands\"\n",
-    "CHARGED_LIGAND_DIR.mkdir(exist_ok=True)\n",
-    "LIGAND_NETWORK_PATH = WORKING_DIR / \"ligand_network.graphml\"\n",
-    "TRANSFORMATION_DIR = WORKING_DIR / \"transformations\"\n",
-    "TRANSFORMATION_DIR.mkdir(exist_ok=True)\n",
-    "SEPTOP_TRANSFORMATION_DIR = WORKING_DIR / \"septop_transformations\"\n",
-    "SEPTOP_TRANSFORMATION_DIR.mkdir(exist_ok=True)\n",
-    "\n",
-    "# Simulation parameters\n",
-    "LAMBDA_WINDOWS = 51\n",
-    "N_REPLICAS = LAMBDA_WINDOWS\n",
-    "PH = 7.0\n",
-    "PRODUCTION_LENGTH = 20.0\n",
-    "REPEATS = 1\n",
-    "TEMPERATURE = 298.15\n",
-    "\n",
-    "LOMAP_MAPPER_SEED = \"O=C(CCCC[C,N])OP(OC[C@H]1O[C@@H](n2cnc3c2ncnc3N)[C@H](O)[C@@H]1O)([O-])=O\"\n",
-    "\n",
-    "\n",
-    "@dataclass\n",
-    "class LomapAtomMapperConfig:\n",
-    "    \"\"\"Configuration for the Lomap atom mapper.\"\"\"\n",
-    "\n",
-    "    time: int = 20\n",
-    "    threed: bool = True\n",
-    "    max3d: float = 1.0\n",
-    "    element_change: bool = False  # Do not allow mappings that change an atoms element\n",
-    "    seed: str = LOMAP_MAPPER_SEED  # SMARTS string for MCS search\n",
-    "    shift: bool = False\n",
-    "\n",
-    "\n",
-    "@dataclass\n",
-    "class KartografAtomMapperConfig:\n",
-    "    \"\"\"Configuration for the Kartograf atom mapper.\"\"\"\n",
-    "\n",
-    "    atom_max_distance: float = 0.9\n",
-    "    atom_map_hydrogens: bool = True\n",
-    "    map_hydrogens_on_hydrogens_only: bool = True\n",
-    "    map_exact_ring_matches_only: bool = True\n",
-    "    allow_partial_fused_rings: bool = False\n",
-    "    allow_bond_breaks: bool = False"
-   ]
+   "source": "COMPLEX_DIR = Path(\"pdbs\")\nSMILES_PATH = Path(\"ligands_amp_fep.smi\")\nPROTEIN_CHAIN_ID = \"A\"\nLIGAND_CHAIN_ID = \"B\"\n\n\ndef complex_pdb_for(name: str) -> Path:\n    \"\"\"Resolve a ligand name (the SMILES `_Name`) to its complex PDB path.\n\n    Ligand names from `SMILES_PATH` are the single source of truth; this\n    function is the only place that encodes how each name maps to a complex\n    PDB file. To change the mapping logic (a different filename template, an\n    explicit lookup table, multiple input directories, etc.), edit only this\n    function.\n\n    Args:\n        name: Ligand name, i.e. the title field of an entry in `SMILES_PATH`.\n\n    Returns:\n        Path to the complex PDB file for that ligand.\n    \"\"\"\n    return COMPLEX_DIR / f\"FLE03_{name}_3a7r_model_0.pdb\"\n\n\nWORKING_DIR = Path(\"tmp\")\nWORKING_DIR.mkdir(exist_ok=True)\n\nALIGN_DIR = WORKING_DIR / \"align\"\nALIGN_DIR.mkdir(exist_ok=True)\nPROTEIN_PATH = WORKING_DIR / \"protein.pdb\"\nFIXED_PROTEIN_PATH = WORKING_DIR / \"protein_fixed.pdb\"\nLIGAND_DIR = WORKING_DIR / \"ligands\"\nLIGAND_DIR.mkdir(exist_ok=True)\nCHARGED_LIGAND_DIR = WORKING_DIR / \"charged_ligands\"\nCHARGED_LIGAND_DIR.mkdir(exist_ok=True)\nLIGAND_NETWORK_PATH = WORKING_DIR / \"ligand_network.graphml\"\nTRANSFORMATION_DIR = WORKING_DIR / \"transformations\"\nTRANSFORMATION_DIR.mkdir(exist_ok=True)\nSEPTOP_TRANSFORMATION_DIR = WORKING_DIR / \"septop_transformations\"\nSEPTOP_TRANSFORMATION_DIR.mkdir(exist_ok=True)\n\n# Simulation parameters\nLAMBDA_WINDOWS = 51\nN_REPLICAS = LAMBDA_WINDOWS  # one replica per lambda window\nPH = 7.0\nPRODUCTION_LENGTH = 20.0\nREPEATS = 1\nTEMPERATURE = 298.15\n\nLOMAP_MAPPER_SEED = \"O=C(CCCC[C,N])OP(OC[C@H]1O[C@@H](n2cnc3c2ncnc3N)[C@H](O)[C@@H]1O)([O-])=O\"\n\n\n@dataclass\nclass LomapAtomMapperConfig:\n    \"\"\"Configuration for the Lomap atom mapper.\"\"\"\n\n    time: int = 20\n    threed: bool = True\n    max3d: float = 1.0\n    element_change: bool = False  # Do not allow mappings that change an atom's element\n    seed: str = LOMAP_MAPPER_SEED  # SMARTS string for MCS search\n    shift: bool = False\n\n\n@dataclass\nclass KartografAtomMapperConfig:\n    \"\"\"Configuration for the Kartograf atom mapper.\"\"\"\n\n    atom_max_distance: float = 0.9\n    atom_map_hydrogens: bool = True\n    map_hydrogens_on_hydrogens_only: bool = True\n    map_exact_ring_matches_only: bool = True\n    allow_partial_fused_rings: bool = False\n    allow_bond_breaks: bool = False"
   },
   {
    "cell_type": "markdown",
    "id": "ce5e9dbf",
    "metadata": {},
-   "source": [
-    "## Prepare structures from complex PDB files\n",
-    "\n",
-    "Each complex PDB file contains a protein and ligand. We select the reference protein as the complex bound to the most aspherical ligand (highest Asphericity descriptor), align all other complexes to it, and assign correct bond orders from the SMILES file using `assign_topology`. The reference protein is expected to already be fixed and protonated (via `fix_pdb`) from a prior run."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "70473571",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def _match_ligand(stem: str, smi_d: dict[str, Chem.Mol]) -> str | None:\n",
-    "    \"\"\"Return the first SMILES-dict key that appears as a whole word in *stem*.\n",
-    "\n",
-    "    Matching uses word-boundary assertions so that e.g. ``\"lig1\"`` matches\n",
-    "    the stem ``\"complex_lig1_chainA\"`` but not ``\"complex_lig10_chainA\"``.\n",
-    "\n",
-    "    Args:\n",
-    "        stem: PDB filename stem (without extension) to search in.\n",
-    "        smi_d: Mapping of ligand names to RDKit template molecules.\n",
-    "\n",
-    "    Returns:\n",
-    "        The matched ligand name, or ``None`` if no key matches.\n",
-    "    \"\"\"\n",
-    "    return next(\n",
-    "        (n for n in smi_d if re.search(rf\"(?<![A-Za-z\\d]){re.escape(n)}(?![A-Za-z\\d])\", stem)),\n",
-    "        None,\n",
-    "    )"
-   ]
+   "source": "## Prepare structures from complex PDB files\n\nEach complex PDB file contains a protein and ligand. We iterate over the ligand names from the SMILES file and resolve each to its complex PDB via `complex_pdb_for`. We select the reference protein as the complex bound to the most aspherical ligand (highest Asphericity descriptor), align all other complexes to it, and assign correct bond orders from the SMILES file using `assign_topology`. The reference protein is expected to already be fixed and protonated (via `fix_pdb`) from a prior run."
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "id": "51b8c74c",
    "metadata": {},
    "outputs": [],
-   "source": [
-    "# Build SMILES template dict from .smi file\n",
-    "smi_suppl = Chem.SmilesMolSupplier(str(SMILES_PATH), sanitize=True)\n",
-    "smiles_dict = {mol.GetProp(\"_Name\"): mol for mol in smi_suppl if mol is not None}\n",
-    "\n",
-    "# Pass 1: extract ligands and compute Asphericity to select reference structure\n",
-    "ligand_asphericity = {}\n",
-    "ligand_volume = {}\n",
-    "for pdb_file in sorted(COMPLEX_DIR.glob(\"*.pdb\")):\n",
-    "    # Pair ligand name with PDB filename; if no match, skip\n",
-    "    lig_name = _match_ligand(pdb_file.stem, smiles_dict)\n",
-    "    if lig_name is None:\n",
-    "        continue\n",
-    "    ligand_pdb = ALIGN_DIR / f\"{lig_name}.pdb\"\n",
-    "    u = mda.Universe(pdb_file)\n",
-    "    u.select_atoms(f\"chainID {LIGAND_CHAIN_ID}\").write(ligand_pdb)\n",
-    "    rdkit_mol = Chem.MolFromPDBFile(str(ligand_pdb), sanitize=True, removeHs=True)\n",
-    "    rdkit_mol = assign_topology(rdkit_mol, smiles_dict[lig_name])\n",
-    "    ligand_asphericity[pdb_file] = Asphericity(rdkit_mol)\n",
-    "    ligand_volume[pdb_file] = AllChem.ComputeMolVolume(rdkit_mol)\n",
-    "\n",
-    "ref_pdb = max(ligand_asphericity, key=ligand_asphericity.get)\n",
-    "print(f\"{'File':<40} {'Asphericity':>12} {'Volume (A^3)':>12}\")\n",
-    "for pdb_file in sorted(ligand_asphericity):\n",
-    "    marker = \" <-- ref\" if pdb_file == ref_pdb else \"\"\n",
-    "    print(\n",
-    "        f\"{pdb_file.name:<40} {ligand_asphericity[pdb_file]:>12.4f}\"\n",
-    "        f\" {ligand_volume[pdb_file]:>12.2f}{marker}\"\n",
-    "    )"
-   ]
+   "source": "# Build SMILES template dict from .smi file\nsmi_suppl = Chem.SmilesMolSupplier(str(SMILES_PATH), sanitize=True)\nsmiles_dict = {mol.GetProp(\"_Name\"): mol for mol in smi_suppl if mol is not None}\nif not smiles_dict:\n    raise ValueError(f\"No ligands could be read from {SMILES_PATH}\")\n\n# Ligand names from the .smi file are the standard; complex_pdb_for resolves\n# each name to its complex PDB file. Check every name up front so that all\n# unresolved ligands are reported together, before any files are written.\nmissing_pdbs = [\n    f\"{name} -> {complex_pdb_for(name)}\"\n    for name in smiles_dict\n    if not complex_pdb_for(name).is_file()\n]\nif missing_pdbs:\n    raise FileNotFoundError(\"No complex PDB for ligand(s): \" + \"; \".join(missing_pdbs))\n\n# Pass 1: extract ligands and compute Asphericity to select reference structure\nligand_asphericity = {}\nligand_volume = {}\nfor lig_name, template in smiles_dict.items():\n    pdb_file = complex_pdb_for(lig_name)\n    ligand_pdb = ALIGN_DIR / f\"{lig_name}.pdb\"\n    u = mda.Universe(pdb_file)\n    u.select_atoms(f\"chainID {LIGAND_CHAIN_ID}\").write(ligand_pdb)\n    rdkit_mol = Chem.MolFromPDBFile(str(ligand_pdb), sanitize=True, removeHs=True)\n    if rdkit_mol is None:\n        # MolFromPDBFile returns None instead of raising when it cannot build a molecule\n        raise ValueError(f\"RDKit could not read ligand {lig_name!r} from {ligand_pdb}\")\n    rdkit_mol = assign_topology(rdkit_mol, template)\n    ligand_asphericity[lig_name] = Asphericity(rdkit_mol)\n    ligand_volume[lig_name] = AllChem.ComputeMolVolume(rdkit_mol)\n\nref_name = max(ligand_asphericity, key=ligand_asphericity.get)\nref_pdb = complex_pdb_for(ref_name)\nprint(f\"{'Ligand':<40} {'Asphericity':>12} {'Volume (A^3)':>12}\")\nfor lig_name in sorted(ligand_asphericity):\n    marker = \" <-- ref\" if lig_name == ref_name else \"\"\n    print(\n        f\"{lig_name:<40} {ligand_asphericity[lig_name]:>12.4f}\"\n        f\" {ligand_volume[lig_name]:>12.2f}{marker}\"\n    )"
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "id": "f3bbbd84",
    "metadata": {},
    "outputs": [],
-   "source": [
-    "# Pass 2: align all complexes to reference, extract ligands and reference protein\n",
-    "ref_u = mda.Universe(ref_pdb)\n",
-    "\n",
-    "for pdb_file in sorted(COMPLEX_DIR.glob(\"*.pdb\")):\n",
-    "    lig_name = _match_ligand(pdb_file.stem, smiles_dict)\n",
-    "    if lig_name is None:\n",
-    "        continue\n",
-    "    mob_u = mda.Universe(pdb_file)\n",
-    "    old_rmsd, new_rmsd = align.alignto(\n",
-    "        mob_u.select_atoms(\"protein and backbone\"),\n",
-    "        ref_u.select_atoms(\"protein and backbone\"),\n",
-    "    )\n",
-    "    ligand_pdb = ALIGN_DIR / f\"{lig_name}_aligned.pdb\"\n",
-    "    mob_u.select_atoms(f\"chainID {LIGAND_CHAIN_ID}\").write(ligand_pdb)\n",
-    "    rdkit_mol = Chem.MolFromPDBFile(str(ligand_pdb), removeHs=True)\n",
-    "    rdkit_mol = assign_topology(rdkit_mol, smiles_dict[lig_name])\n",
-    "    rdkit_mol.SetProp(\"_Name\", lig_name)\n",
-    "    Chem.MolToMolFile(rdkit_mol, str(LIGAND_DIR / f\"{lig_name}.sdf\"))\n",
-    "\n",
-    "    print(f\"{lig_name}  (RMSD: {old_rmsd:.3f} -> {new_rmsd:.3f} A)\")\n",
-    "    view_mol_3d(rdkit_mol, width=400, height=300, show=True)\n",
-    "    if pdb_file == ref_pdb:\n",
-    "        mob_u.select_atoms(f\"chainID {PROTEIN_CHAIN_ID}\").write(PROTEIN_PATH)"
-   ]
+   "source": "# Pass 2: align all complexes to reference, extract ligands and reference protein\nref_u = mda.Universe(ref_pdb)\n\nfor lig_name, template in smiles_dict.items():\n    pdb_file = complex_pdb_for(lig_name)\n    mob_u = mda.Universe(pdb_file)\n    old_rmsd, new_rmsd = align.alignto(\n        mob_u.select_atoms(\"protein and backbone\"),\n        ref_u.select_atoms(\"protein and backbone\"),\n    )\n    ligand_pdb = ALIGN_DIR / f\"{lig_name}_aligned.pdb\"\n    mob_u.select_atoms(f\"chainID {LIGAND_CHAIN_ID}\").write(ligand_pdb)\n    rdkit_mol = Chem.MolFromPDBFile(str(ligand_pdb), removeHs=True)\n    if rdkit_mol is None:\n        # MolFromPDBFile returns None instead of raising when it cannot build a molecule\n        raise ValueError(f\"RDKit could not read ligand {lig_name!r} from {ligand_pdb}\")\n    rdkit_mol = assign_topology(rdkit_mol, template)\n    rdkit_mol.SetProp(\"_Name\", lig_name)\n    Chem.MolToMolFile(rdkit_mol, str(LIGAND_DIR / f\"{lig_name}.sdf\"))\n\n    print(f\"{lig_name}  (RMSD: {old_rmsd:.3f} -> {new_rmsd:.3f} A)\")\n    view_mol_3d(rdkit_mol, width=400, height=300, show=True)\n    # Only the reference complex supplies the protein structure written to PROTEIN_PATH\n    if lig_name == ref_name:\n        mob_u.select_atoms(f\"chainID {PROTEIN_CHAIN_ID}\").write(PROTEIN_PATH)"
   },
   {
    "cell_type": "code",