Skip to content

Commit 6893428

Browse files
feat: enhance protonation handling in PDB fixing process
- Introduced a new function `_propka_variants` to determine protonation states based on PROPKA predictions, allowing for more accurate hydrogen addition in the `fix_pdb` function. - Updated `fix_pdb` to support a new `protonation` parameter, enabling users to choose between model defaults and PROPKA-driven protonation. - Added comprehensive tests for the new functionality, ensuring correct behavior for various residue types and protonation scenarios. - Improved logging for unsupported residue types during protonation variant mapping.
1 parent e8c9ed6 commit 6893428

2 files changed

Lines changed: 206 additions & 4 deletions

File tree

src/mdpp/prep/protein.py

Lines changed: 78 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,26 @@
55
import logging
66
from dataclasses import dataclass
77
from pathlib import Path
8+
from typing import TYPE_CHECKING, Literal
89

910
import mdtraj as md
1011
from Bio.PDB import Select
1112

1213
from mdpp._types import StrPath
1314

15+
if TYPE_CHECKING:
16+
from openmm.app import Topology
17+
1418
logger = logging.getLogger(__name__)
1519

20+
# OpenMM Modeller.addHydrogens variant names used to apply PROPKA-predicted
21+
# protonation. A residue is overridden only where PROPKA disagrees with the
22+
# model-pKa default (PropkaResult.get_nonstandard); every other residue keeps
23+
# OpenMM's default pH-based selection. Residue types not listed here (e.g.
24+
# termini "N+"/"C-") are left to OpenMM and logged.
25+
_PROTONATED_VARIANT: dict[str, str] = {"ASP": "ASH", "GLU": "GLH", "HIS": "HIP"}
26+
_DEPROTONATED_VARIANT: dict[str, str] = {"LYS": "LYN", "CYS": "CYX", "HIS": "HIE"}
27+
1628

1729
@dataclass(frozen=True, slots=True)
1830
class PropkaResidue:
@@ -136,7 +148,52 @@ def accept_chain(self, chain) -> int: # type: ignore[override]
136148
return int(chain.id in self.chain_ids)
137149

138150

139-
def fix_pdb(pdb_path: StrPath, fixed_pdb_path: StrPath, pH: float = 7.0) -> None:
151+
def _propka_variants(
152+
topology: Topology,
153+
nonstandard: tuple[PropkaResidue, ...],
154+
pH: float,
155+
) -> list[str | None]:
156+
"""Build a per-residue protonation variant list for ``Modeller.addHydrogens``.
157+
158+
Only residues in ``nonstandard`` (where PROPKA disagrees with the model-pKa
159+
default) are overridden to PROPKA's predicted state; every other residue
160+
maps to ``None`` so OpenMM's default pH-based rule applies. Residues whose
161+
type has no supported variant (e.g. termini ``N+``/``C-``) are skipped and
162+
logged.
163+
164+
Args:
165+
topology: OpenMM topology the variant list must align with (one entry
166+
per residue, in topology order).
167+
nonstandard: Residues whose PROPKA-predicted protonation differs from
168+
the model-pKa default.
169+
pH: pH at which protonation states are evaluated.
170+
171+
Returns:
172+
One entry per topology residue: the OpenMM variant name to force, or
173+
``None`` to keep OpenMM's default selection.
174+
"""
175+
overrides: dict[tuple[str, str, str], str] = {}
176+
for residue in nonstandard:
177+
table = _PROTONATED_VARIANT if residue.is_protonated_at(pH) else _DEPROTONATED_VARIANT
178+
variant = table.get(residue.residue_type)
179+
if variant is None:
180+
logger.warning(
181+
"PROPKA protonation for %s has no OpenMM variant; keeping default.",
182+
residue.label,
183+
)
184+
continue
185+
overrides[(residue.chain_id, str(residue.res_num), residue.residue_type)] = variant
186+
logger.info("Applying PROPKA protonation %s -> %s", residue.label, variant)
187+
return [overrides.get((res.chain.id, res.id, res.name)) for res in topology.residues()]
188+
189+
190+
def fix_pdb(
191+
pdb_path: StrPath,
192+
fixed_pdb_path: StrPath,
193+
pH: float = 7.0,
194+
*,
195+
protonation: Literal["model", "propka"] = "model",
196+
) -> None:
140197
"""Fix a PDB file by adding missing residues, atoms, and hydrogens.
141198
142199
Removes heterogens (excluding water by default), identifies missing
@@ -151,6 +208,14 @@ def fix_pdb(pdb_path: StrPath, fixed_pdb_path: StrPath, pH: float = 7.0) -> None
151208
pdb_path: Path to the input PDB file.
152209
fixed_pdb_path: Path where the fixed PDB will be written.
153210
pH: pH value for hydrogen placement.
211+
protonation: Protonation policy. ``"model"`` (default) uses PDBFixer's
212+
built-in model pKa values. ``"propka"`` keeps the model default for
213+
most residues but overrides the residues where PROPKA disagrees
214+
(``PropkaResult.get_nonstandard``) with PROPKA's predicted state,
215+
applied via OpenMM ``Modeller`` variants. Supported overrides are
216+
ASP/GLU/LYS/HIS/CYS (a neutral histidine uses the HIE tautomer);
217+
unsupported residue types (e.g. termini) keep the default and are
218+
logged.
154219
"""
155220
result = run_propka(pdb_path)
156221
nonstandard = result.get_nonstandard(pH)
@@ -169,18 +234,27 @@ def fix_pdb(pdb_path: StrPath, fixed_pdb_path: StrPath, pH: float = 7.0) -> None
169234
lines,
170235
)
171236

172-
from openmm.app import PDBFile
237+
from openmm.app import Modeller, PDBFile
173238
from pdbfixer import PDBFixer
174239

175240
fixer = PDBFixer(filename=str(pdb_path))
176241
fixer.removeHeterogens(keepWater=False)
177242
fixer.findMissingResidues()
178243
fixer.findMissingAtoms()
179244
fixer.addMissingAtoms()
180-
fixer.addMissingHydrogens(pH=pH)
245+
246+
if protonation == "propka" and nonstandard:
247+
# Apply PROPKA-predicted states for the disagreeing residues; OpenMM's
248+
# default pH rule still handles every other residue (None variant).
249+
modeller = Modeller(fixer.topology, fixer.positions)
250+
modeller.addHydrogens(pH=pH, variants=_propka_variants(modeller.topology, nonstandard, pH))
251+
topology, positions = modeller.topology, modeller.positions
252+
else:
253+
fixer.addMissingHydrogens(pH=pH)
254+
topology, positions = fixer.topology, fixer.positions
181255

182256
with Path(fixed_pdb_path).open("w") as f:
183-
PDBFile.writeFile(fixer.topology, fixer.positions, f)
257+
PDBFile.writeFile(topology, positions, f)
184258

185259

186260
def strip_solvent(

tests/prep/test_protein.py

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
"""Tests for ``mdpp.prep.protein`` protonation handling (PROPKA + PDBFixer)."""
2+
3+
from __future__ import annotations
4+
5+
import logging
6+
from pathlib import Path
7+
from types import SimpleNamespace
8+
9+
import pytest
10+
11+
from mdpp.prep.protein import (
12+
PropkaResidue,
13+
PropkaResult,
14+
_propka_variants,
15+
fix_pdb,
16+
)
17+
18+
# Minimal single-residue glutamate with crude but valid geometry. PDBFixer
19+
# assigns bonds from residue templates, and Modeller adds the GLH proton (HE2)
20+
# only when the GLU variant is forced -- a clean protonation discriminator.
21+
_GLU_PDB = """ATOM 1 N GLU A 1 0.000 0.000 0.000 1.00 0.00 N
22+
ATOM 2 CA GLU A 1 1.458 0.000 0.000 1.00 0.00 C
23+
ATOM 3 C GLU A 1 2.009 1.420 0.000 1.00 0.00 C
24+
ATOM 4 O GLU A 1 1.251 2.390 0.000 1.00 0.00 O
25+
ATOM 5 CB GLU A 1 1.988 -0.773 -1.209 1.00 0.00 C
26+
ATOM 6 CG GLU A 1 3.508 -0.889 -1.250 1.00 0.00 C
27+
ATOM 7 CD GLU A 1 4.000 -1.700 -2.430 1.00 0.00 C
28+
ATOM 8 OE1 GLU A 1 3.250 -2.560 -2.930 1.00 0.00 O
29+
ATOM 9 OE2 GLU A 1 5.150 -1.480 -2.850 1.00 0.00 O
30+
ATOM 10 OXT GLU A 1 3.330 1.560 0.000 1.00 0.00 O
31+
TER
32+
END
33+
"""
34+
35+
36+
def _fake_topology(residues: list[tuple[str, str, str]]) -> object:
37+
"""Fake OpenMM topology whose ``residues()`` yields (chain.id, id, name) stubs."""
38+
objs = [
39+
SimpleNamespace(chain=SimpleNamespace(id=chain), id=rid, name=name)
40+
for chain, rid, name in residues
41+
]
42+
return SimpleNamespace(residues=lambda: iter(objs))
43+
44+
45+
@pytest.fixture()
46+
def glu_pdb(tmp_path: Path) -> Path:
47+
"""Write the minimal glutamate fixture and return its path."""
48+
path = tmp_path / "glu.pdb"
49+
path.write_text(_GLU_PDB, encoding="utf-8")
50+
return path
51+
52+
53+
# --------------------------------------------------------------------------- #
54+
# _propka_variants (pure mapping logic)
55+
# --------------------------------------------------------------------------- #
56+
def test_propka_variants_overrides_disagreeing_residues() -> None:
57+
nonstandard = (
58+
PropkaResidue("GLU", 54, "A", 7.64, 4.50), # protonated -> GLH
59+
PropkaResidue("LYS", 100, "A", 6.00, 10.50), # deprotonated -> LYN
60+
PropkaResidue("HIS", 267, "A", 7.50, 6.50), # protonated -> HIP
61+
)
62+
topology = _fake_topology([
63+
("A", "53", "ALA"),
64+
("A", "54", "GLU"),
65+
("A", "100", "LYS"),
66+
("A", "267", "HIS"),
67+
])
68+
assert _propka_variants(topology, nonstandard, pH=7.0) == [None, "GLH", "LYN", "HIP"]
69+
70+
71+
def test_propka_variants_cys_deprotonated_to_cyx() -> None:
72+
nonstandard = (PropkaResidue("CYS", 10, "A", 5.0, 9.0),) # deprotonated -> CYX
73+
topology = _fake_topology([("A", "10", "CYS")])
74+
assert _propka_variants(topology, nonstandard, pH=7.0) == ["CYX"]
75+
76+
77+
def test_propka_variants_skips_unsupported_type(caplog: pytest.LogCaptureFixture) -> None:
78+
nonstandard = (PropkaResidue("N+", 1, "A", 9.0, 8.0),) # terminus: no variant
79+
topology = _fake_topology([("A", "1", "MET")])
80+
with caplog.at_level(logging.WARNING):
81+
assert _propka_variants(topology, nonstandard, pH=7.0) == [None]
82+
assert "no OpenMM variant" in caplog.text
83+
84+
85+
def test_propka_variants_empty_is_all_none() -> None:
86+
topology = _fake_topology([("A", "1", "ALA"), ("A", "2", "GLY")])
87+
assert _propka_variants(topology, (), pH=7.0) == [None, None]
88+
89+
90+
# --------------------------------------------------------------------------- #
91+
# fix_pdb dispatch (real PDBFixer + Modeller; PROPKA result is controlled)
92+
# --------------------------------------------------------------------------- #
93+
def _atom_names(pdb_path: Path) -> set[str]:
94+
return {
95+
line[12:16].strip()
96+
for line in pdb_path.read_text().splitlines()
97+
if line.startswith(("ATOM", "HETATM"))
98+
}
99+
100+
101+
def test_fix_pdb_model_leaves_glu_deprotonated(
102+
glu_pdb: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch
103+
) -> None:
104+
monkeypatch.setattr("mdpp.prep.protein.run_propka", lambda _p: PropkaResult(()))
105+
out = tmp_path / "model.pdb"
106+
fix_pdb(glu_pdb, out)
107+
names = _atom_names(out)
108+
assert "H" in names # hydrogens were added
109+
assert "HE2" not in names # GLU stays charged under the model default
110+
111+
112+
def test_fix_pdb_propka_protonates_glu(
113+
glu_pdb: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch
114+
) -> None:
115+
result = PropkaResult((PropkaResidue("GLU", 1, "A", 7.64, 4.50),))
116+
monkeypatch.setattr("mdpp.prep.protein.run_propka", lambda _p: result)
117+
out = tmp_path / "propka.pdb"
118+
fix_pdb(glu_pdb, out, protonation="propka")
119+
assert "HE2" in _atom_names(out) # PROPKA-driven GLH proton applied
120+
121+
122+
def test_fix_pdb_propka_without_nonstandard_matches_model(
123+
glu_pdb: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch
124+
) -> None:
125+
monkeypatch.setattr("mdpp.prep.protein.run_propka", lambda _p: PropkaResult(()))
126+
out = tmp_path / "propka_empty.pdb"
127+
fix_pdb(glu_pdb, out, protonation="propka")
128+
assert "HE2" not in _atom_names(out) # nothing to override -> model default

0 commit comments

Comments
 (0)