diff --git a/mhctools/__init__.py b/mhctools/__init__.py index 29ef8ff..5956735 100644 --- a/mhctools/__init__.py +++ b/mhctools/__init__.py @@ -1,6 +1,7 @@ from .binding_prediction import BindingPrediction from .binding_prediction_collection import BindingPredictionCollection from .pred import ( + FIELD_BEST_DIRECTIONS, Kind, MHC_CLASS_VALUES, MHC_DEPENDENCE_VALUES, @@ -8,6 +9,8 @@ PeptideResult, Pred, Prediction, + VALUE_BEST_DIRECTIONS, + best_direction, preds_from_rows, ) from .sample import MultiSample @@ -72,7 +75,7 @@ def __getattr__(name): raise AttributeError( "module %r has no attribute %r" % (__name__, name)) -__version__ = "3.13.7" +__version__ = "3.14.1" __all__ = [ "Prediction", @@ -82,6 +85,9 @@ def __getattr__(name): "Kind", "MHC_CLASS_VALUES", "MHC_DEPENDENCE_VALUES", + "FIELD_BEST_DIRECTIONS", + "VALUE_BEST_DIRECTIONS", + "best_direction", "preds_from_rows", "MultiSample", "BindingPrediction", diff --git a/mhctools/pred.py b/mhctools/pred.py index 36abfdc..8c82f75 100644 --- a/mhctools/pred.py +++ b/mhctools/pred.py @@ -53,6 +53,72 @@ class Kind: erap_trimming = "erap_trimming" +# Canonical "best direction" for each prediction field. Used by +# downstream aggregators (e.g. "best across alleles" or "best across +# methods") and by the :class:`PeptideResult` ``.best_*`` accessors. +# +# - ``score``: every kind that uses score normalizes higher = better +# (binding strength, presentation likelihood, immunogenicity, ...). +# - ``percentile_rank``: 0 means best, smaller is better, every kind. +# - ``value``: kind-dependent — see :data:`VALUE_BEST_DIRECTIONS`. +FIELD_BEST_DIRECTIONS = { + "score": "max", + "percentile_rank": "min", +} + +# Per-kind direction for ``value``. ``value`` carries the predictor's +# raw output unit (IC50 nM for affinity, half-life for stability, ...); +# whether higher or lower is "better" depends on what the unit means. +# Add an entry when introducing a new ``value``-bearing kind. +VALUE_BEST_DIRECTIONS = { + Kind.pMHC_affinity: "min", # IC50 nM + Kind.pMHC_stability: "max", # half-life +} + + +def best_direction(kind, field) -> str: + """Canonical "best" direction for a ``(kind, field)`` pair. + + Returns ``"max"`` or ``"min"``. + + Parameters + ---------- + kind : str or Kind + Prediction kind (e.g. ``Kind.pMHC_affinity``). + field : str + Column name within the kind's predictions — + ``"score"``, ``"percentile_rank"``, or ``"value"``. + + Raises + ------ + ValueError + If ``field`` is unknown, or if ``field == "value"`` for a kind + without a registered direction in :data:`VALUE_BEST_DIRECTIONS`. + + Notes + ----- + ``score`` and ``percentile_rank`` directions are uniform across all + kinds. ``value`` is kind-dependent: e.g. ``pMHC_affinity`` reports + IC50 in nM (lower better) while ``pMHC_stability`` reports half-life + (higher better). + """ + direction = FIELD_BEST_DIRECTIONS.get(field) + if direction is not None: + return direction + if field == "value": + if kind not in VALUE_BEST_DIRECTIONS: + raise ValueError( + f"best_direction undefined for ({kind!r}, 'value') — " + f"`value` semantics depend on the kind. Add an entry " + f"to mhctools.pred.VALUE_BEST_DIRECTIONS." + ) + return VALUE_BEST_DIRECTIONS[kind] + known = sorted(FIELD_BEST_DIRECTIONS) + ["value"] + raise ValueError( + f"best_direction undefined for field {field!r}. Known: {known}." + ) + + COLUMNS = ( "sample_name", "peptide", @@ -177,27 +243,27 @@ def alleles(self) -> set: @property def affinity(self) -> Optional[Prediction]: """Best affinity prediction, or None.""" - return self._best_by_score(Kind.pMHC_affinity) + return self.best_by_score(Kind.pMHC_affinity) @property def presentation(self) -> Optional[Prediction]: """Best presentation prediction, or None.""" - return self._best_by_score(Kind.pMHC_presentation) + return self.best_by_score(Kind.pMHC_presentation) @property def stability(self) -> Optional[Prediction]: """Best stability prediction, or None.""" - return self._best_by_score(Kind.pMHC_stability) + return self.best_by_score(Kind.pMHC_stability) @property def immunogenicity(self) -> Optional[Prediction]: """Best immunogenicity prediction, or None.""" - return self._best_by_score(Kind.immunogenicity) + return self.best_by_score(Kind.immunogenicity) @property def cleavage(self) -> Optional[Prediction]: """Best proteasomal cleavage prediction, or None.""" - return self._best_by_score(Kind.proteasome_cleavage) + return self.best_by_score(Kind.proteasome_cleavage) # backward compat aliases @property @@ -216,15 +282,15 @@ def best_stability(self) -> Optional[Prediction]: @property def best_affinity_by_rank(self) -> Optional[Prediction]: - return self._best_by_rank(Kind.pMHC_affinity) + return self.best_by_rank(Kind.pMHC_affinity) @property def best_presentation_by_rank(self) -> Optional[Prediction]: - return self._best_by_rank(Kind.pMHC_presentation) + return self.best_by_rank(Kind.pMHC_presentation) @property def best_stability_by_rank(self) -> Optional[Prediction]: - return self._best_by_rank(Kind.pMHC_stability) + return self.best_by_rank(Kind.pMHC_stability) # --- filtering --- @@ -253,20 +319,49 @@ def to_dataframe(self, sample_name=""): return pd.DataFrame(columns=COLUMNS) return pd.DataFrame(rows, columns=COLUMNS) - # --- internals --- - - def _best_by_score(self, kind) -> Optional[Prediction]: - candidates = [p for p in self.preds if p.kind == kind and p.allele] - if not candidates: - # Fall back to preds without allele (e.g. processing predictors) - candidates = [p for p in self.preds if p.kind == kind] - return max(candidates, key=lambda p: p.score) if candidates else None - - def _best_by_rank(self, kind) -> Optional[Prediction]: - candidates = [p for p in self.preds - if p.kind == kind and p.allele - and p.percentile_rank is not None] - return min(candidates, key=lambda p: p.percentile_rank) if candidates else None + # --- best_by (public, direction-aware) --- + + def best_by(self, kind, field) -> Optional[Prediction]: + """Return the prediction of ``kind`` with the best ``field``, or None. + + "Best" direction comes from :func:`best_direction` — + ``score`` is max-better, ``percentile_rank`` is min-better, and + ``value`` is kind-dependent (IC50 lower-better, half-life higher-better). + + Predictions with ``None`` for ``field`` are skipped. Predictions + with an allele are preferred; if none of the matching kind have an + allele, falls back to allele-less predictions (e.g. processing + predictors that emit allele-independent scores). + """ + direction = best_direction(kind, field) + op = max if direction == "max" else min + + def has_value(p): + return getattr(p, field) is not None + + with_allele = [p for p in self.preds + if p.kind == kind and p.allele and has_value(p)] + if with_allele: + return op(with_allele, key=lambda p: getattr(p, field)) + without_allele = [p for p in self.preds + if p.kind == kind and has_value(p)] + if without_allele: + return op(without_allele, key=lambda p: getattr(p, field)) + return None + + def best_by_score(self, kind) -> Optional[Prediction]: + """Best prediction of ``kind`` by ``score`` (max-better).""" + return self.best_by(kind, "score") + + def best_by_rank(self, kind) -> Optional[Prediction]: + """Best prediction of ``kind`` by ``percentile_rank`` (min-better).""" + return self.best_by(kind, "percentile_rank") + + def best_by_value(self, kind) -> Optional[Prediction]: + """Best prediction of ``kind`` by ``value``. Direction is kind-specific + (see :data:`VALUE_BEST_DIRECTIONS`). Raises ``ValueError`` for kinds + without a registered ``value`` direction.""" + return self.best_by(kind, "value") def preds_from_rows(rows, **shared): diff --git a/tests/test_pred.py b/tests/test_pred.py index 9d486a0..dc809e1 100644 --- a/tests/test_pred.py +++ b/tests/test_pred.py @@ -10,13 +10,18 @@ # See the License for the specific language governing permissions and # limitations under the License. +import pytest + from mhctools.pred import ( COLUMNS, + FIELD_BEST_DIRECTIONS, Kind, MHC_CLASS_VALUES, MHC_DEPENDENCE_VALUES, PeptideResult, Prediction, + VALUE_BEST_DIRECTIONS, + best_direction, preds_from_rows, ) from mhctools.sample import MultiSample @@ -572,3 +577,149 @@ def test_collection_to_peptide_preds(): assert len(pp_list) == 2 # two distinct peptide positions sizes = sorted(len(pp.preds) for pp in pp_list) assert sizes == [1, 2] + + +# -- best_direction -- + + +def test_field_best_directions_constants(): + """score is max-better, percentile_rank is min-better — uniform across kinds.""" + assert FIELD_BEST_DIRECTIONS["score"] == "max" + assert FIELD_BEST_DIRECTIONS["percentile_rank"] == "min" + + +def test_value_best_directions_kind_specific(): + """value direction is kind-dependent; affinity uses IC50 (min-better).""" + assert VALUE_BEST_DIRECTIONS[Kind.pMHC_affinity] == "min" + assert VALUE_BEST_DIRECTIONS[Kind.pMHC_stability] == "max" + + +def test_best_direction_field_defaults(): + # score is max for any kind that uses it + assert best_direction(Kind.pMHC_affinity, "score") == "max" + assert best_direction(Kind.pMHC_presentation, "score") == "max" + assert best_direction(Kind.proteasome_cleavage, "score") == "max" + # percentile_rank is min for any kind that uses it + assert best_direction(Kind.pMHC_affinity, "percentile_rank") == "min" + assert best_direction(Kind.pMHC_presentation, "percentile_rank") == "min" + + +def test_best_direction_value_kind_specific(): + assert best_direction(Kind.pMHC_affinity, "value") == "min" # IC50 + assert best_direction(Kind.pMHC_stability, "value") == "max" # half-life + + +def test_best_direction_value_unregistered_raises(): + """`value` for a kind without a registered direction must raise — + we refuse to guess the unit semantics.""" + with pytest.raises(ValueError, match="best_direction undefined"): + best_direction(Kind.pMHC_presentation, "value") + with pytest.raises(ValueError, match="best_direction undefined"): + best_direction(Kind.proteasome_cleavage, "value") + + +def test_best_direction_unknown_field_raises(): + with pytest.raises(ValueError, match="best_direction undefined for field"): + best_direction(Kind.pMHC_affinity, "made_up_field") + + +def test_best_direction_accepts_string_kind(): + # Kind constants are plain strings, so callers can pass either form. + assert best_direction("pMHC_affinity", "value") == "min" + assert best_direction("pMHC_affinity", "score") == "max" + + +# -- best_by (public, direction-aware) -- + + +def test_best_by_score_picks_max(): + ps = _make_pred_set() + best = ps.best_by_score(Kind.pMHC_affinity) + assert best.allele == "HLA-A*02:01" + assert best.score == 0.85 + + +def test_best_by_rank_picks_min(): + ps = _make_pred_set() + best = ps.best_by_rank(Kind.pMHC_affinity) + assert best.allele == "HLA-A*02:01" + assert best.percentile_rank == 0.8 + + +def test_best_by_value_affinity_picks_min_ic50(): + ps = _make_pred_set() + best = ps.best_by_value(Kind.pMHC_affinity) + assert best.allele == "HLA-A*02:01" + assert best.value == 120.5 + + +def test_best_by_value_stability_picks_max_half_life(): + ps = preds_from_rows( + [ + dict(kind=Kind.pMHC_stability, allele="HLA-A*02:01", + score=0.6, value=4.0), + dict(kind=Kind.pMHC_stability, allele="HLA-B*07:02", + score=0.9, value=12.0), + ], + peptide="SIINFEKL", + ) + best = ps.best_by_value(Kind.pMHC_stability) + assert best.allele == "HLA-B*07:02" + assert best.value == 12.0 + + +def test_best_by_value_unregistered_kind_raises(): + ps = _make_pred_set() + with pytest.raises(ValueError, match="best_direction undefined"): + ps.best_by_value(Kind.pMHC_presentation) + + +def test_best_by_skips_none_field(): + # antigen_processing pred has score but None for percentile_rank + ps = _make_pred_set() + assert ps.best_by_rank(Kind.antigen_processing) is None + assert ps.best_by_score(Kind.antigen_processing) is not None + + +def test_best_by_falls_back_to_allele_less(): + ps = preds_from_rows( + [dict(kind=Kind.proteasome_cleavage, score=0.7), + dict(kind=Kind.proteasome_cleavage, score=0.9)], + peptide="SIINFEKL", + ) + best = ps.best_by_score(Kind.proteasome_cleavage) + assert best is not None + assert best.score == 0.9 + assert best.allele == "" + + +def test_best_by_missing_kind_returns_none(): + ps = _make_pred_set() + assert ps.best_by_score(Kind.immunogenicity) is None + assert ps.best_by_rank(Kind.pMHC_stability) is None + + +def test_best_by_accepts_string_kind(): + # Kind constants are plain strings, so callers can pass either form. + ps = _make_pred_set() + by_kind = ps.best_by_score(Kind.pMHC_affinity) + by_str = ps.best_by_score("pMHC_affinity") + assert by_kind == by_str + assert ps.best_by("pMHC_affinity", "value") == ps.best_by_value(Kind.pMHC_affinity) + + +def test_best_by_rank_falls_back_to_allele_less(): + # Behavior change vs the old _best_by_rank: rank predictions without an + # allele are now considered when no allele-bearing rank pred exists. Pin + # the new contract so it can't silently regress. + ps = preds_from_rows( + [ + dict(kind=Kind.pMHC_presentation, score=0.5, percentile_rank=10.0), + dict(kind=Kind.pMHC_presentation, score=0.8, percentile_rank=2.0), + ], + peptide="SIINFEKL", + ) + best = ps.best_presentation_by_rank + assert best is not None + assert best.allele == "" + assert best.percentile_rank == 2.0