Skip to content

Commit d160650

Browse files
Parse all predictor outputs from all chunks together
1 parent 6a8c7aa commit d160650

89 files changed

Lines changed: 3417 additions & 3346 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

pvactools/lib/fasta_generator.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -434,7 +434,7 @@ def execute(self):
434434
continue
435435
fasta_sequences.setdefault(sequence, []).append(seq_id)
436436

437-
output_file = "{}.{}.tsv".format(self.output_file_prefix, length)
437+
output_file = "{}.{}.fa".format(self.output_file_prefix, length)
438438
self.output_files.append(output_file)
439439
output_key_file = "{}.key".format(output_file)
440440
writer = open(output_file, 'w')

pvactools/lib/output_parser.py

Lines changed: 33 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -49,14 +49,16 @@ class OutputParser(metaclass=ABCMeta):
4949
def __init__(self, **kwargs):
5050
self.input_iedb_files = kwargs['input_iedb_files']
5151
self.input_tsv_file = kwargs['input_tsv_file']
52-
self.key_file = kwargs['key_file']
52+
self.key_files = kwargs['key_files']
5353
self.output_file = kwargs['output_file']
5454
self.sample_name = kwargs['sample_name']
5555
self.add_sample_name = kwargs.get('add_sample_name_column')
5656
self.flurry_state = kwargs.get('flurry_state')
5757
self.use_normalized_percentiles = kwargs.get('use_normalized_percentiles', False)
5858
self.reference_scores_path = kwargs.get('reference_scores_path', None)
5959
self.reference_scores = {}
60+
self.pipeline_type = kwargs.get('pipeline_type', None)
61+
self.input_file_type = kwargs.get('input_file_type')
6062

6163
def parse_input_tsv_file(self):
6264
with open(self.input_tsv_file, 'r') as reader:
@@ -1090,12 +1092,16 @@ def execute(self):
10901092
class DefaultOutputParser(OutputParser):
10911093

10921094
def parse_iedb_file(self, tsv_entries):
1093-
with open(self.key_file, 'r') as key_file_reader:
1094-
protein_identifiers_from_label = yaml.load(key_file_reader, Loader=yaml.FullLoader)
1095+
protein_identifiers_from_label = {}
1096+
for key_file in self.key_files:
1097+
with open(key_file, 'r') as key_file_reader:
1098+
chunk = key_file.rsplit('.', 2)[1].split('_')[1]
1099+
protein_identifiers_from_label[chunk] = yaml.load(key_file_reader, Loader=yaml.FullLoader)
10951100
iedb_results = {}
10961101
wt_iedb_results = {}
10971102
for input_iedb_file in self.input_iedb_files:
10981103
with open(input_iedb_file, 'r') as reader:
1104+
chunk = input_iedb_file.rsplit('_', 1)[1]
10991105
iedb_tsv_reader = csv.DictReader(reader, delimiter='\t')
11001106
filename = os.path.basename(input_iedb_file)
11011107

@@ -1118,8 +1124,8 @@ def parse_iedb_file(self, tsv_entries):
11181124
allele = line['allele']
11191125
peptide_length = len(epitope)
11201126

1121-
if protein_identifiers_from_label[protein_label] is not None:
1122-
protein_identifiers = protein_identifiers_from_label[protein_label]
1127+
if protein_identifiers_from_label[chunk][protein_label] is not None:
1128+
protein_identifiers = protein_identifiers_from_label[chunk][protein_label]
11231129

11241130
for protein_identifier in protein_identifiers:
11251131
(protein_type, tsv_index) = protein_identifier.split('.', 1)
@@ -1152,11 +1158,21 @@ def parse_iedb_file(self, tsv_entries):
11521158

11531159
class UnmatchedSequencesOutputParser(OutputParser):
11541160
def parse_iedb_file(self):
1155-
with open(self.key_file, 'r') as key_file_reader:
1156-
tsv_indices_from_label = yaml.load(key_file_reader, Loader=yaml.FullLoader)
1161+
protein_identifiers_from_label = {}
1162+
for key_file in self.key_files:
1163+
with open(key_file, 'r') as key_file_reader:
1164+
if self.input_file_type == 'pvacvector_input_fasta':
1165+
chunk = 1
1166+
else:
1167+
chunk = key_file.rsplit('.', 2)[1].split('_')[1]
1168+
protein_identifiers_from_label[chunk] = yaml.load(key_file_reader, Loader=yaml.FullLoader)
11571169
iedb_results = {}
11581170
for input_iedb_file in self.input_iedb_files:
11591171
with open(input_iedb_file, 'r') as reader:
1172+
if self.input_file_type == 'pvacvector_input_fasta':
1173+
chunk = 1
1174+
else:
1175+
chunk = input_iedb_file.rsplit('_', 1)[1]
11601176
iedb_tsv_reader = csv.DictReader(reader, delimiter='\t')
11611177
filename = os.path.basename(input_iedb_file)
11621178

@@ -1179,8 +1195,8 @@ def parse_iedb_file(self):
11791195
allele = line['allele']
11801196
peptide_length = len(epitope)
11811197

1182-
if tsv_indices_from_label[protein_label] is not None:
1183-
tsv_indices = tsv_indices_from_label[protein_label]
1198+
if protein_identifiers_from_label[chunk][protein_label] is not None:
1199+
tsv_indices = protein_identifiers_from_label[chunk][protein_label]
11841200

11851201
for index in tsv_indices:
11861202
key = '|'.join([index, position])
@@ -1353,13 +1369,17 @@ def execute(self):
13531369
class PvacspliceOutputParser(UnmatchedSequencesOutputParser):
13541370
def parse_iedb_file(self):
13551371
# input key file
1356-
with open(self.key_file, 'r') as key_file_reader:
1357-
protein_identifiers_from_label = yaml.load(key_file_reader, Loader=yaml.FullLoader)
1372+
protein_identifiers_from_label = {}
1373+
for key_file in self.key_files:
1374+
with open(key_file, 'r') as key_file_reader:
1375+
chunk = key_file.rsplit('.', 2)[1].split('_')[1]
1376+
protein_identifiers_from_label[chunk] = yaml.load(key_file_reader, Loader=yaml.FullLoader)
13581377
# final output
13591378
iedb_results = {}
13601379
for input_iedb_file in self.input_iedb_files:
13611380
# input iedb file
13621381
with open(input_iedb_file, 'r') as reader:
1382+
chunk = input_iedb_file.rsplit('_', 1)[1]
13631383
iedb_tsv_reader = csv.DictReader(reader, delimiter='\t')
13641384
filename = os.path.basename(input_iedb_file)
13651385
pattern = re.compile(rf"{re.escape(self.sample_name)}\.(\w+(?:-\d+\.\d+)?)")
@@ -1376,9 +1396,9 @@ def parse_iedb_file(self):
13761396
peptide_length = len(epitope)
13771397
scores = self.get_scores(line, method)
13781398
# get fasta_id/combined_name from fasta key file
1379-
if protein_identifiers_from_label[fasta_label] is not None:
1399+
if protein_identifiers_from_label[chunk][fasta_label] is not None:
13801400
# comma-separated string (1 or more ids) as 1 entry in list
1381-
protein_label = protein_identifiers_from_label[fasta_label][0]
1401+
protein_label = protein_identifiers_from_label[chunk][fasta_label][0]
13821402
# one index at a time
13831403
for key in protein_label.split(','):
13841404

0 commit comments

Comments
 (0)