Skip to content

Commit f2f65d7

Browse files
committed
Adding splice junction exon annotation information
1 parent af38ace commit f2f65d7

1 file changed

Lines changed: 151 additions & 4 deletions

File tree

workflow/scripts/leafcutter_annotation.py

Lines changed: 151 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -346,6 +346,107 @@ def parse_intron(line_list, intron_idx):
346346
return (chrom, start, stop, cluster_id)
347347

348348

349+
def index_annotated_splice_junctions(
350+
file,
351+
single_value_features=["transcript_name", "exon_strand"],
352+
multi_value_features=["exon_id", "exon_number", "exon_start_end"],
353+
multi_value_delim="|"
354+
):
355+
"""Indexes the annotated splice junctions from the
356+
splicing annotation file passed via the --splicing-ann
357+
command line argument. This function returns a dictionary
358+
containing exon-exon pair information for a given
359+
transcript for a given splice junction.
360+
Where:
361+
SpliceJunction
362+
| |
363+
transcriptX ---#########------------######-------
364+
exonA exonB
365+
@param file <str>:
366+
Splicing annotation file containing exon
367+
information for each transcript.
368+
@param single_value_features <list[str]>:
369+
List of features that are single valued
370+
for each splice junction. These features
371+
will be stored as a single value in the
372+
output dictionary.
373+
@param multi_value_features <list[str]>:
374+
List of features that are multi valued
375+
for each splice junction. These features
376+
reprsent surrounding exon information
377+
and will be stored as a single value
378+
in the output dictionary, where each
379+
value is delimited by multi_value_delim.
380+
@param multi_value_delim <str>:
381+
Delimiter used to separate multiple values
382+
for multi valued features in the output
383+
dictionary. Default is "|".
384+
@return <dict[str]=dict[str]>:
385+
Returns a dictionary where:
386+
key = "{transcript}:{exonA_end}:{exonB_start}"
387+
value = Dictionary containing additional
388+
information about the exons surrounding
389+
the splice junction. This dictionary has
390+
the following keys:
391+
• transcript_name: The name of the
392+
transcript the splice junction is
393+
associated with.
394+
• exon_strand: Strand the exon is on.
395+
• exon_id: The identifiers of the
396+
surrounding exons. Information for
397+
ExonA and ExonB are delimited by
398+
a pipe "|" character.
399+
• exon_number: The number of the
400+
surrounding exons. Information for
401+
ExonA and ExonB are delimited by
402+
a pipe "|" character.
403+
• exon_start_end: Start and end position
404+
of the surrounding exons. Information
405+
for ExonA and ExonB are delimited by
406+
a pipe "|" character.
407+
"""
408+
log("Started indexing input file: ", file)
409+
file_idx = {}
410+
# Handler for opening files, i.e.
411+
# uncompressed or gzip files
412+
open_func = gzip.open if file.endswith('.gz') else open
413+
line_number = 0 # Used for error reporting
414+
with open_func(file, 'rt') as fh:
415+
header = next(fh)
416+
col_idx = index_header(header)
417+
for line in fh:
418+
# Split the line into columns
419+
tokens = line.strip().split('\t')
420+
# Parse the splice junction information
421+
transcript = tokens[col_idx["transcript_id"]]
422+
nexons = len(tokens[col_idx["exon_id"]].split(multi_value_delim))
423+
for i in range(0, nexons-1):
424+
exonA_end = tokens[col_idx["exon_start_end"]].split(
425+
multi_value_delim
426+
)[i].split(":")[1]
427+
exonB_start = tokens[col_idx["exon_start_end"]].split(
428+
multi_value_delim
429+
)[i+1].split(":")[0]
430+
# Create the key for the splice junction
431+
_k = "{0}:{1}:{2}".format(
432+
transcript, exonA_end, exonB_start
433+
)
434+
if _k not in file_idx:
435+
# Initialize the dictionary for this key
436+
file_idx[_k] = {}
437+
# Parse single and multi valued features
438+
for feature in single_value_features:
439+
file_idx[_k][feature] = tokens[col_idx[feature]]
440+
for feature in multi_value_features:
441+
# Join multi valued features with the
442+
# multi_value_delim character
443+
file_idx[_k][feature] = multi_value_delim.join(
444+
tokens[col_idx[feature]].split(multi_value_delim)[i:i+2]
445+
)
446+
log("Completed indexing input file: ", file)
447+
return file_idx
448+
449+
349450
def get_additional_annotation_information(annotation_dict, first_key, values):
350451
"""Get additional annotation information from a nested
351452
dictionary using the first_key and each value in values
@@ -407,6 +508,8 @@ def get_additional_annotation_information(annotation_dict, first_key, values):
407508
# annotation file where:
408509
# key = {chr}:{intron_start}:{intron_end}:{clust_id}
409510
PARSE_INTRON_ANN = ["gene","ensemblID","verdict","transcripts"]
511+
INTRON_TRANSCRIPT_IDX = PARSE_INTRON_ANN.index("transcripts")
512+
INTRON_VERDICT_IDX = PARSE_INTRON_ANN.index("verdict")
410513
intron_ann_dict = index_file(
411514
args.intron_ann,
412515
keys=["chr","start","end","clusterID"],
@@ -417,13 +520,21 @@ def get_additional_annotation_information(annotation_dict, first_key, values):
417520
# Parse exon information from the
418521
# splicing annotation file where:
419522
# key = {transcript}:{exonA_end}:{exonB_start}
523+
SINGULAR_EXON_FEATURES = ["transcript_name", "exon_strand"]
524+
SURRONDING_EXON_FEATURES = ["exon_id", "exon_number", "exon_start_end"]
525+
splicing_ann_dict = index_annotated_splice_junctions(
526+
args.splicing_ann,
527+
single_value_features=SINGULAR_EXON_FEATURES,
528+
multi_value_features=SURRONDING_EXON_FEATURES,
529+
multi_value_delim="|"
530+
)
420531

421532
# Loop through effect sizes file
422533
# and add more detailed information
423534
log("Writing annotated output file: ", args.output)
424535
ofh = open(args.output, "w")
425536
with open(args.effect_sizes, "r") as ifh:
426-
input_header = next(ifh).rstrip().split("\t") + PARSE_CLUSTER_SIGNIF + PARSE_INTRON_ANN
537+
input_header = next(ifh).rstrip().split("\t") + PARSE_CLUSTER_SIGNIF + PARSE_INTRON_ANN + SINGULAR_EXON_FEATURES + SURRONDING_EXON_FEATURES
427538
output_header = "\t".join(input_header)
428539
intron_idx = input_header.index("intron")
429540
ofh.write(output_header + "\n")
@@ -459,10 +570,46 @@ def get_additional_annotation_information(annotation_dict, first_key, values):
459570
":".join(intron),
460571
PARSE_INTRON_ANN
461572
)
573+
# Get exon information for the
574+
# splice junction, where:
575+
# key = "transcript:exonA_end:exonB_start"
576+
# i.e "{transcript}:{intron_start}:{intron_end}"
577+
# and values are the features
578+
# in SINGULAR_EXON_FEATURES and
579+
# SURRONDING_EXON_FEATURES
580+
_exon_splice_junction_keys = [
581+
"{0}:{1}:{2}".format(t,intron_start,intron_stop) \
582+
for t in _intron_ann_values[INTRON_TRANSCRIPT_IDX].split("+")
583+
]
584+
# Aggregate 1:M features across for
585+
# splice junctions effecting more than
586+
# one transcript
587+
_exon_features_dict = {}
588+
# Loop through each splice junction
589+
for sj in _exon_splice_junction_keys:
590+
for feature in SINGULAR_EXON_FEATURES + SURRONDING_EXON_FEATURES:
591+
if feature not in _exon_features_dict:
592+
_exon_features_dict[feature] = []
593+
_exon_features_dict[feature].append(
594+
splicing_ann_dict.get(sj, {}).get(feature, "NA")
595+
)
596+
# Collapse exon-exon strand information,
597+
# it will be on the same strand for a
598+
# given transcript splice junction
599+
_exon_features_dict["exon_strand"] = ["|".join(_exon_features_dict["exon_strand"])]
600+
# Aggregate the exon information
601+
_exon_ann_values = []
602+
for feature in SINGULAR_EXON_FEATURES + SURRONDING_EXON_FEATURES:
603+
# Join multi valued features with the
604+
# multi_value_delim character
605+
_exon_ann_values.append(
606+
"+".join(_exon_features_dict[feature])
607+
)
462608
# Write annotated line to output
463-
_output_line = "{0}\t{1}\t{2}".format(
609+
_output_line = "{0}\t{1}\t{2}\t{3}".format(
464610
"\t".join(tokens),
465611
"\t".join(_cluster_signif_values),
466-
"\t".join(_intron_ann_values)
612+
"\t".join(_intron_ann_values),
613+
"\t".join(_exon_ann_values)
467614
)
468-
ofh.write(_output_line + "\n")
615+
ofh.write(_output_line + "\n")

0 commit comments

Comments
 (0)