Skip to content

Commit ec315d9

Browse files
authored
Merge pull request #137 from evancofer/master
Remove chromosome/scaffold name mangling for bed/vcf files
2 parents dbef3f5 + 9a0aee6 commit ec315d9

File tree

2 files changed: 16 additions (+16) and 3 deletions (-3)

selene_sdk/predict/_variant_effect_prediction.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,11 @@ def read_vcf_file(input_path,
6262
"""
6363
variants = []
6464
na_rows = []
65+
check_chr = True
66+
for chrom in reference_sequence.get_chrs():
67+
if not chrom.startswith("chr"):
68+
check_chr = False
69+
break
6570
with open(input_path, 'r') as file_handle:
6671
lines = file_handle.readlines()
6772
index = 0
@@ -85,12 +90,15 @@ def read_vcf_file(input_path,
8590
chrom = str(cols[0])
8691
if 'CHR' == chrom[:3]:
8792
chrom = chrom.replace('CHR', 'chr')
88-
elif "chr" not in chrom:
93+
elif "chr" not in chrom and check_chr is True:
8994
chrom = "chr" + chrom
9095

9196
if chrom == "chrMT" and \
9297
chrom not in reference_sequence.get_chrs():
9398
chrom = "chrM"
99+
elif chrom == "MT" and \
100+
chrom not in reference_sequence.get_chrs():
101+
chrom = "M"
94102

95103
pos = int(cols[1])
96104
name = cols[2]

selene_sdk/predict/model_predict.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,11 @@ def _get_sequences_from_bed_file(self,
298298
sequences = []
299299
labels = []
300300
na_rows = []
301+
check_chr = True
302+
for chrom in reference_sequence.get_chrs():
303+
if not chrom.startswith("chr"):
304+
check_chr = False
305+
break
301306
with open(input_path, 'r') as read_handle:
302307
for i, line in enumerate(read_handle):
303308
cols = line.strip().split('\t')
@@ -310,8 +315,8 @@ def _get_sequences_from_bed_file(self,
310315
strand = '.'
311316
if isinstance(strand_index, int) and len(cols) > strand_index:
312317
strand = cols[strand_index]
313-
if 'chr' not in chrom:
314-
chrom = 'chr{0}'.format(chrom)
318+
if 'chr' not in chrom and check_chr is True:
319+
chrom = "chr{0}".format(chrom)
315320
if not str.isdigit(start) or not str.isdigit(end) \
316321
or chrom not in self.reference_sequence.genome:
317322
na_rows.append(line)

0 commit comments

Comments
 (0)