1010# See the License for the specific language governing permissions and
1111# limitations under the License.
1212
13+ import json
14+ import logging
15+ import subprocess
16+ import sys
17+
1318from .proteasome_predictor import ProteasomePredictor
1419
# Module-level cache for loaded pepsickle models. Keyed by human_only.
_model_cache = {}

# Module logger; the subprocess code path reports failures through it before
# raising.
logger = logging.getLogger(__name__)

# Default value of the ``subprocess_timeout`` argument of ``Pepsickle``:
# the number of seconds an isolated pepsickle subprocess may run.
PEPSICKLE_SUBPROCESS_TIMEOUT_SECONDS = 300
26+
# Source of the helper program run with ``<python> -c`` when pepsickle
# inference is isolated in a subprocess.
#
# Protocol:
#   stdin  - one JSON object with the keys "human_only", "threshold" and
#            "sequences" (a list of strings).
#   stdout - one JSON object {"results": {sequence: [score, ...]}} where each
#            score is element 2 of an entry returned by
#            predict_protein_cleavage_locations.
#
# The text is executed verbatim by the child interpreter, so it must remain
# a complete, valid Python program.
_PEPSICKLE_SUBPROCESS_SCRIPT = r"""
import json
import sys

from pepsickle.model_functions import (
    initialize_epitope_model,
    predict_protein_cleavage_locations,
)

request = json.loads(sys.stdin.read())
model = initialize_epitope_model(human_only=request["human_only"])
results = {}
for sequence in request["sequences"]:
    preds_raw = predict_protein_cleavage_locations(
        sequence,
        model,
        mod_type="epitope",
        proteasome_type="C",
        threshold=request["threshold"],
    )
    results[sequence] = [entry[2] for entry in preds_raw]
json.dump({"results": results}, sys.stdout)
"""
50+
1851
1952class Pepsickle (ProteasomePredictor ):
2053 """
@@ -39,27 +72,41 @@ class Pepsickle(ProteasomePredictor):
3972
4073 human_only : bool
4174 If True, use human-only trained model instead of all-mammal.
75+
76+ isolate_subprocess : bool
77+ If True, run pepsickle inference in a short-lived Python subprocess.
78+ This avoids macOS duplicate OpenMP runtime crashes when the parent
79+ process has already imported packages such as pandas, numpy, or
80+ pyarrow.
81+
82+ subprocess_timeout : int
83+ Timeout in seconds for isolated pepsickle inference.
4284 """
4385
4486 def __init__ (
4587 self ,
4688 default_peptide_lengths = None ,
4789 scoring = None ,
4890 threshold = 0.5 ,
49- human_only = False ):
91+ human_only = False ,
92+ isolate_subprocess = False ,
93+ subprocess_timeout = PEPSICKLE_SUBPROCESS_TIMEOUT_SECONDS ):
5094 ProteasomePredictor .__init__ (
5195 self ,
5296 default_peptide_lengths = default_peptide_lengths ,
5397 scoring = scoring ,
5498 )
5599 self .threshold = threshold
56100 self .human_only = human_only
101+ self .isolate_subprocess = isolate_subprocess
102+ self .subprocess_timeout = subprocess_timeout
57103 self ._model = None
58104
59105 def __str__ (self ):
60- return "%s(scoring=%s)" % (
106+ return "%s(scoring=%s, isolate_subprocess=%s )" % (
61107 self .__class__ .__name__ ,
62- getattr (self .scoring , "__name__" , repr (self .scoring )))
108+ getattr (self .scoring , "__name__" , repr (self .scoring )),
109+ self .isolate_subprocess )
63110
64111 def _predictor_name (self ):
65112 return "pepsickle"
@@ -75,6 +122,20 @@ def _load_model(self):
75122 return self ._model
76123
77124 def cleavage_probs (self , sequence ):
125+ return self .cleavage_probs_many ([sequence ])[sequence ]
126+
127+ def cleavage_probs_many (self , sequences ):
128+ unique_sequences = list (dict .fromkeys (sequences ))
129+ if not unique_sequences :
130+ return {}
131+ if self .isolate_subprocess :
132+ return self ._cleavage_probs_many_subprocess (unique_sequences )
133+ return {
134+ sequence : self ._cleavage_probs_in_process (sequence )
135+ for sequence in unique_sequences
136+ }
137+
138+ def _cleavage_probs_in_process (self , sequence ):
78139 from pepsickle .model_functions import predict_protein_cleavage_locations
79140 model = self ._load_model ()
80141 preds_raw = predict_protein_cleavage_locations (
@@ -85,3 +146,78 @@ def cleavage_probs(self, sequence):
85146 threshold = self .threshold ,
86147 )
87148 return [entry [2 ] for entry in preds_raw ]
149+
150+ def _cleavage_probs_many_subprocess (self , sequences ):
151+ payload = json .dumps ({
152+ "human_only" : bool (self .human_only ),
153+ "threshold" : float (self .threshold ),
154+ "sequences" : sequences ,
155+ })
156+ try :
157+ result = subprocess .run (
158+ [sys .executable , "-c" , _PEPSICKLE_SUBPROCESS_SCRIPT ],
159+ input = payload ,
160+ text = True ,
161+ capture_output = True ,
162+ timeout = self .subprocess_timeout ,
163+ )
164+ except subprocess .TimeoutExpired as e :
165+ msg = (
166+ "pepsickle subprocess timed out after %d seconds "
167+ "while scoring %d sequences"
168+ % (self .subprocess_timeout , len (sequences )))
169+ logger .warning (msg )
170+ raise RuntimeError (msg ) from e
171+ except OSError as e :
172+ msg = "Could not start pepsickle subprocess: %s" % e
173+ logger .warning (msg )
174+ raise RuntimeError (msg ) from e
175+
176+ stderr_text = result .stderr .strip ()
177+ if stderr_text :
178+ logger .warning ("pepsickle subprocess stderr:\n %s" , stderr_text )
179+ if result .returncode != 0 :
180+ logger .warning (
181+ "pepsickle subprocess exited with code %d" ,
182+ result .returncode )
183+ raise RuntimeError (
184+ "pepsickle subprocess exited with code %d.\n stdout: %s\n "
185+ "stderr: %s"
186+ % (result .returncode , result .stdout .strip (), stderr_text ))
187+
188+ try :
189+ parsed = json .loads (result .stdout )
190+ except ValueError as e :
191+ logger .warning ("Could not parse pepsickle subprocess JSON output" )
192+ raise RuntimeError (
193+ "Could not parse pepsickle subprocess JSON output: %s"
194+ % result .stdout .strip ()) from e
195+
196+ results = parsed .get ("results" )
197+ if not isinstance (results , dict ):
198+ logger .warning (
199+ "pepsickle subprocess output is missing a results object" )
200+ raise RuntimeError (
201+ "pepsickle subprocess output is missing a results object" )
202+ missing = [sequence for sequence in sequences if sequence not in results ]
203+ if missing :
204+ logger .warning (
205+ "pepsickle subprocess omitted %d sequences" ,
206+ len (missing ))
207+ raise RuntimeError (
208+ "pepsickle subprocess omitted %d sequences" % len (missing ))
209+
210+ output = {}
211+ for sequence in sequences :
212+ probs = results [sequence ]
213+ if len (probs ) != len (sequence ):
214+ logger .warning (
215+ "pepsickle subprocess returned %d scores for a "
216+ "%d-residue sequence" ,
217+ len (probs ),
218+ len (sequence ))
219+ raise ValueError (
220+ "Expected %d pepsickle scores for sequence, got %d"
221+ % (len (sequence ), len (probs )))
222+ output [sequence ] = [float (p ) for p in probs ]
223+ return output
0 commit comments