Skip to content

Commit 7823207

Browse files
authored
Add pepsickle subprocess isolation
Adds opt-in subprocess isolation for Pepsickle, batches processing predictor cleavage calls, preserves generator peptide inputs, and bumps mhctools to 3.13.3.
1 parent 98b032f commit 7823207

5 files changed

Lines changed: 264 additions & 10 deletions

File tree

mhctools/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ def __getattr__(name):
6363
raise AttributeError(
6464
"module %r has no attribute %r" % (__name__, name))
6565

66-
__version__ = "3.13.2"
66+
__version__ = "3.13.3"
6767

6868
__all__ = [
6969
"Prediction",

mhctools/pepsickle.py

Lines changed: 139 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,44 @@
1010
# See the License for the specific language governing permissions and
1111
# limitations under the License.
1212

13+
import json
14+
import logging
15+
import subprocess
16+
import sys
17+
1318
from .proteasome_predictor import ProteasomePredictor
1419

1520
# Module-level cache for loaded pepsickle models. Keyed by human_only.
1621
_model_cache = {}
1722

23+
logger = logging.getLogger(__name__)
24+
25+
PEPSICKLE_SUBPROCESS_TIMEOUT_SECONDS = 300
26+
27+
_PEPSICKLE_SUBPROCESS_SCRIPT = r"""
28+
import json
29+
import sys
30+
31+
from pepsickle.model_functions import (
32+
initialize_epitope_model,
33+
predict_protein_cleavage_locations,
34+
)
35+
36+
request = json.loads(sys.stdin.read())
37+
model = initialize_epitope_model(human_only=request["human_only"])
38+
results = {}
39+
for sequence in request["sequences"]:
40+
preds_raw = predict_protein_cleavage_locations(
41+
sequence,
42+
model,
43+
mod_type="epitope",
44+
proteasome_type="C",
45+
threshold=request["threshold"],
46+
)
47+
results[sequence] = [entry[2] for entry in preds_raw]
48+
json.dump({"results": results}, sys.stdout)
49+
"""
50+
1851

1952
class Pepsickle(ProteasomePredictor):
2053
"""
@@ -39,27 +72,41 @@ class Pepsickle(ProteasomePredictor):
3972
4073
human_only : bool
4174
If True, use human-only trained model instead of all-mammal.
75+
76+
isolate_subprocess : bool
77+
If True, run pepsickle inference in a short-lived Python subprocess.
78+
This avoids macOS duplicate OpenMP runtime crashes when the parent
79+
process has already imported packages such as pandas, numpy, or
80+
pyarrow.
81+
82+
subprocess_timeout : int
83+
Timeout in seconds for isolated pepsickle inference.
4284
"""
4385

4486
def __init__(
4587
self,
4688
default_peptide_lengths=None,
4789
scoring=None,
4890
threshold=0.5,
49-
human_only=False):
91+
human_only=False,
92+
isolate_subprocess=False,
93+
subprocess_timeout=PEPSICKLE_SUBPROCESS_TIMEOUT_SECONDS):
5094
ProteasomePredictor.__init__(
5195
self,
5296
default_peptide_lengths=default_peptide_lengths,
5397
scoring=scoring,
5498
)
5599
self.threshold = threshold
56100
self.human_only = human_only
101+
self.isolate_subprocess = isolate_subprocess
102+
self.subprocess_timeout = subprocess_timeout
57103
self._model = None
58104

59105
def __str__(self):
60-
return "%s(scoring=%s)" % (
106+
return "%s(scoring=%s, isolate_subprocess=%s)" % (
61107
self.__class__.__name__,
62-
getattr(self.scoring, "__name__", repr(self.scoring)))
108+
getattr(self.scoring, "__name__", repr(self.scoring)),
109+
self.isolate_subprocess)
63110

64111
def _predictor_name(self):
65112
return "pepsickle"
@@ -75,6 +122,20 @@ def _load_model(self):
75122
return self._model
76123

77124
def cleavage_probs(self, sequence):
    """
    Return per-position cleavage scores for a single sequence.

    Delegates to ``cleavage_probs_many`` with a one-element list and
    returns the entry stored under ``sequence``.
    """
    scores_by_sequence = self.cleavage_probs_many([sequence])
    return scores_by_sequence[sequence]
126+
127+
def cleavage_probs_many(self, sequences):
    """
    Return cleavage scores for several sequences, keyed by sequence.

    Duplicates are dropped (first occurrence wins, order preserved)
    before scoring. An empty input yields an empty dict. When
    ``self.isolate_subprocess`` is true the whole de-duplicated batch is
    handed to ``_cleavage_probs_many_subprocess``; otherwise each
    sequence is scored via ``_cleavage_probs_in_process``.
    """
    deduplicated = list(dict.fromkeys(sequences))
    if not deduplicated:
        return {}
    if not self.isolate_subprocess:
        scores = {}
        for seq in deduplicated:
            scores[seq] = self._cleavage_probs_in_process(seq)
        return scores
    return self._cleavage_probs_many_subprocess(deduplicated)
137+
138+
def _cleavage_probs_in_process(self, sequence):
78139
from pepsickle.model_functions import predict_protein_cleavage_locations
79140
model = self._load_model()
80141
preds_raw = predict_protein_cleavage_locations(
@@ -85,3 +146,78 @@ def cleavage_probs(self, sequence):
85146
threshold=self.threshold,
86147
)
87148
return [entry[2] for entry in preds_raw]
149+
150+
def _cleavage_probs_many_subprocess(self, sequences):
    """
    Score sequences with pepsickle in a separate Python interpreter.

    A JSON request holding ``human_only``, ``threshold`` and the
    sequences is written to the child's stdin. The child is expected to
    print a JSON object of the form
    ``{"results": {sequence: [score, ...]}}`` on stdout.

    Parameters
    ----------
    sequences : iterable of str
        Sequences to score. Materialised into a list up front because
        it is serialised once and then iterated again for validation.

    Returns
    -------
    dict
        Maps every input sequence to a list of floats, one per residue.

    Raises
    ------
    RuntimeError
        If the subprocess times out, cannot be started, exits with a
        non-zero code, prints output that is not valid JSON, prints
        JSON without a ``results`` object, or omits any sequence.
    ValueError
        If the number of scores for a sequence differs from its length.
    """
    sequences = list(sequences)
    payload = json.dumps({
        "human_only": bool(self.human_only),
        "threshold": float(self.threshold),
        "sequences": sequences,
    })
    try:
        result = subprocess.run(
            [sys.executable, "-c", _PEPSICKLE_SUBPROCESS_SCRIPT],
            input=payload,
            text=True,
            capture_output=True,
            timeout=self.subprocess_timeout,
        )
    except subprocess.TimeoutExpired as e:
        # %s rather than %d so a fractional timeout (e.g. 0.5) is not
        # reported as a truncated integer.
        msg = (
            "pepsickle subprocess timed out after %s seconds "
            "while scoring %d sequences"
            % (self.subprocess_timeout, len(sequences)))
        logger.warning(msg)
        raise RuntimeError(msg) from e
    except OSError as e:
        msg = "Could not start pepsickle subprocess: %s" % e
        logger.warning(msg)
        raise RuntimeError(msg) from e

    # Anything the child wrote to stderr is surfaced even on success.
    stderr_text = result.stderr.strip()
    if stderr_text:
        logger.warning("pepsickle subprocess stderr:\n%s", stderr_text)
    if result.returncode != 0:
        logger.warning(
            "pepsickle subprocess exited with code %d",
            result.returncode)
        raise RuntimeError(
            "pepsickle subprocess exited with code %d.\nstdout: %s\n"
            "stderr: %s"
            % (result.returncode, result.stdout.strip(), stderr_text))

    try:
        parsed = json.loads(result.stdout)
    except ValueError as e:
        logger.warning("Could not parse pepsickle subprocess JSON output")
        raise RuntimeError(
            "Could not parse pepsickle subprocess JSON output: %s"
            % result.stdout.strip()) from e

    # Valid JSON that is not an object (list, number, null, ...) has no
    # .get(); treat it the same as a missing "results" entry instead of
    # leaking an AttributeError to the caller.
    results = parsed.get("results") if isinstance(parsed, dict) else None
    if not isinstance(results, dict):
        logger.warning(
            "pepsickle subprocess output is missing a results object")
        raise RuntimeError(
            "pepsickle subprocess output is missing a results object")
    missing = [sequence for sequence in sequences if sequence not in results]
    if missing:
        logger.warning(
            "pepsickle subprocess omitted %d sequences",
            len(missing))
        raise RuntimeError(
            "pepsickle subprocess omitted %d sequences" % len(missing))

    output = {}
    for sequence in sequences:
        probs = results[sequence]
        if len(probs) != len(sequence):
            logger.warning(
                "pepsickle subprocess returned %d scores for a "
                "%d-residue sequence",
                len(probs),
                len(sequence))
            raise ValueError(
                "Expected %d pepsickle scores for sequence, got %d"
                % (len(sequence), len(probs)))
        output[sequence] = [float(p) for p in probs]
    return output

mhctools/processing_predictor.py

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,18 @@ def cleavage_probs(self, sequence):
199199
raise NotImplementedError(
200200
"%s must implement cleavage_probs" % self.__class__.__name__)
201201

202+
def cleavage_probs_many(self, sequences):
    """
    Compute per-position cleavage probabilities for several sequences.

    This default scores one sequence at a time through
    ``cleavage_probs``; subclasses may override it to share expensive
    setup across a batch. Repeated sequences are scored once, and the
    returned dict is keyed by sequence in first-seen order.
    """
    distinct = list(dict.fromkeys(sequences))
    probs_by_sequence = {}
    for seq in distinct:
        probs_by_sequence[seq] = self.cleavage_probs(seq)
    return probs_by_sequence
213+
202214
# ------------------------------------------------------------------
203215
# Component helpers
204216
# ------------------------------------------------------------------
@@ -273,13 +285,18 @@ def predict(self, peptides, n_flanks=None, c_flanks=None):
273285
-------
274286
list of PeptideResult
275287
"""
276-
results = []
288+
prediction_inputs = []
277289
for i, peptide in enumerate(peptides):
278290
n_flank = n_flanks[i] if n_flanks else ""
279291
c_flank = c_flanks[i] if c_flanks else ""
280-
281292
full_seq = n_flank + peptide + c_flank
282-
probs = self.cleavage_probs(full_seq)
293+
prediction_inputs.append((peptide, n_flank, c_flank, full_seq))
294+
full_sequences = [item[3] for item in prediction_inputs]
295+
probs_by_sequence = self.cleavage_probs_many(full_sequences)
296+
297+
results = []
298+
for peptide, n_flank, c_flank, full_seq in prediction_inputs:
299+
probs = probs_by_sequence[full_seq]
283300

284301
offset = len(n_flank)
285302
score = self._peptide_score(probs, offset, len(peptide))
@@ -336,10 +353,11 @@ def predict_proteins(self, sequence_dict, peptide_lengths=None,
336353
sequence_dict = {seq: seq for seq in sequence_dict}
337354

338355
peptide_lengths = self._resolve_peptide_lengths(peptide_lengths)
356+
probs_by_sequence = self.cleavage_probs_many(sequence_dict.values())
339357

340358
results = defaultdict(list)
341359
for name, sequence in sequence_dict.items():
342-
probs = self.cleavage_probs(sequence)
360+
probs = probs_by_sequence[sequence]
343361
for plen in peptide_lengths:
344362
for i in range(len(sequence) - plen + 1):
345363
peptide = sequence[i:i + plen]
@@ -388,8 +406,9 @@ def predict_cleavage_sites(self, sequence_dict):
388406
"""
389407
if isinstance(sequence_dict, str):
390408
sequence_dict = {"seq": sequence_dict}
409+
probs_by_sequence = self.cleavage_probs_many(sequence_dict.values())
391410
return {
392-
name: self.cleavage_probs(seq)
411+
name: probs_by_sequence[seq]
393412
for name, seq in sequence_dict.items()
394413
}
395414

tests/test_pepsickle.py

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@
1010
# See the License for the specific language governing permissions and
1111
# limitations under the License.
1212

13+
import json
14+
import subprocess
15+
1316
import pytest
1417

1518
from mhctools.pepsickle import Pepsickle
@@ -39,6 +42,7 @@ def test_init_defaults():
3942
p = Pepsickle()
4043
assert p.threshold == 0.5
4144
assert p.human_only is False
45+
assert p.isolate_subprocess is False
4246
assert p.default_peptide_lengths == [9]
4347
assert p.scoring is score_cterm_anti_max_internal
4448
assert not hasattr(p, "alleles")
@@ -71,6 +75,57 @@ def test_cleavage_probs(predictor):
7175
assert all(0.0 <= p <= 1.0 for p in probs)
7276

7377

78+
def test_isolated_cleavage_probs_matches_in_process():
    """Subprocess-isolated scores agree with in-process scores."""
    expected = Pepsickle().cleavage_probs(PROTEIN)
    isolated_predictor = Pepsickle(isolate_subprocess=True)
    actual = isolated_predictor.cleavage_probs(PROTEIN)
    assert actual == pytest.approx(expected)
82+
83+
84+
def test_isolated_cleavage_probs_batches_unique_sequences(monkeypatch):
    """Duplicate inputs collapse into a single subprocess request."""
    recorded = []

    # Parameter names mirror the keyword arguments the predictor passes
    # to subprocess.run, so they must stay as they are.
    def fake_run(args, input, text, capture_output, timeout):
        request = json.loads(input)
        recorded.append({
            "args": args,
            "request": request,
            "text": text,
            "capture_output": capture_output,
            "timeout": timeout,
        })
        scores = {}
        for seq in request["sequences"]:
            scores[seq] = [0.25] * len(seq)
        return subprocess.CompletedProcess(
            args=args,
            returncode=0,
            stdout=json.dumps({"results": scores}),
            stderr="",
        )

    monkeypatch.setattr("mhctools.pepsickle.subprocess.run", fake_run)

    predictor = Pepsickle(
        threshold=0.25,
        human_only=True,
        isolate_subprocess=True,
        subprocess_timeout=7,
    )
    result = predictor.cleavage_probs_many([PROTEIN, PROTEIN, "AC"])

    assert result[PROTEIN] == [0.25] * len(PROTEIN)
    assert result["AC"] == [0.25, 0.25]

    # Exactly one subprocess invocation, carrying the de-duplicated batch.
    assert len(recorded) == 1
    only_call = recorded[0]
    assert only_call["request"] == {
        "human_only": True,
        "threshold": 0.25,
        "sequences": [PROTEIN, "AC"],
    }
    assert only_call["text"] is True
    assert only_call["capture_output"] is True
    assert only_call["timeout"] == 7
127+
128+
74129
# -- predict --
75130

76131
def test_predict_returns_peptide_preds(predictor):
@@ -191,4 +246,3 @@ def test_scoring_methods_produce_different_scores():
191246
tuple(round(s, 6) for s in v) for v in scores_by_fn.values())
192247
assert len(unique) > 1
193248

194-

0 commit comments

Comments
 (0)