@@ -346,6 +346,107 @@ def parse_intron(line_list, intron_idx):
346346 return (chrom , start , stop , cluster_id )
347347
348348
349+ def index_annotated_splice_junctions (
350+ file ,
351+ single_value_features = ["transcript_name" , "exon_strand" ],
352+ multi_value_features = ["exon_id" , "exon_number" , "exon_start_end" ],
353+ multi_value_delim = "|"
354+ ):
355+ """Indexes the annotated splice junctions from the
356+ splicing annotation file passed via the --splicing-ann
357+ command line argument. This function returns a dictionary
358+ containing exon-exon pair information for a given
359+ transcript for a given splice junction.
360+ Where:
361+ SpliceJunction
362+ | |
363+ transcriptX ---#########------------######-------
364+ exonA exonB
365+ @param file <str>:
366+ Splicing annotation file containing exon
367+ information for each transcript.
368+ @param single_value_features <list[str]>:
369+ List of features that are single valued
370+ for each splice junction. These features
371+ will be stored as a single value in the
372+ output dictionary.
373+ @param multi_value_features <list[str]>:
374+ List of features that are multi valued
375+ for each splice junction. These features
376+ reprsent surrounding exon information
377+ and will be stored as a single value
378+ in the output dictionary, where each
379+ value is delimited by multi_value_delim.
380+ @param multi_value_delim <str>:
381+ Delimiter used to separate multiple values
382+ for multi valued features in the output
383+ dictionary. Default is "|".
384+ @return <dict[str]=dict[str]>:
385+ Returns a dictionary where:
386+ key = "{transcript}:{exonA_end}:{exonB_start}"
387+ value = Dictionary containing additional
388+ information about the exons surrounding
389+ the splice junction. This dictionary has
390+ the following keys:
391+ • transcript_name: The name of the
392+ transcript the splice junction is
393+ associated with.
394+ • exon_strand: Strand the exon is on.
395+ • exon_id: The identifiers of the
396+ surrounding exons. Information for
397+ ExonA and ExonB are delimited by
398+ a pipe "|" character.
399+ • exon_number: The number of the
400+ surrounding exons. Information for
401+ ExonA and ExonB are delimited by
402+ a pipe "|" character.
403+ • exon_start_end: Start and end position
404+ of the surrounding exons. Information
405+ for ExonA and ExonB are delimited by
406+ a pipe "|" character.
407+ """
408+ log ("Started indexing input file: " , file )
409+ file_idx = {}
410+ # Handler for opening files, i.e.
411+ # uncompressed or gzip files
412+ open_func = gzip .open if file .endswith ('.gz' ) else open
413+ line_number = 0 # Used for error reporting
414+ with open_func (file , 'rt' ) as fh :
415+ header = next (fh )
416+ col_idx = index_header (header )
417+ for line in fh :
418+ # Split the line into columns
419+ tokens = line .strip ().split ('\t ' )
420+ # Parse the splice junction information
421+ transcript = tokens [col_idx ["transcript_id" ]]
422+ nexons = len (tokens [col_idx ["exon_id" ]].split (multi_value_delim ))
423+ for i in range (0 , nexons - 1 ):
424+ exonA_end = tokens [col_idx ["exon_start_end" ]].split (
425+ multi_value_delim
426+ )[i ].split (":" )[1 ]
427+ exonB_start = tokens [col_idx ["exon_start_end" ]].split (
428+ multi_value_delim
429+ )[i + 1 ].split (":" )[0 ]
430+ # Create the key for the splice junction
431+ _k = "{0}:{1}:{2}" .format (
432+ transcript , exonA_end , exonB_start
433+ )
434+ if _k not in file_idx :
435+ # Initialize the dictionary for this key
436+ file_idx [_k ] = {}
437+ # Parse single and multi valued features
438+ for feature in single_value_features :
439+ file_idx [_k ][feature ] = tokens [col_idx [feature ]]
440+ for feature in multi_value_features :
441+ # Join multi valued features with the
442+ # multi_value_delim character
443+ file_idx [_k ][feature ] = multi_value_delim .join (
444+ tokens [col_idx [feature ]].split (multi_value_delim )[i :i + 2 ]
445+ )
446+ log ("Completed indexing input file: " , file )
447+ return file_idx
448+
449+
349450def get_additional_annotation_information (annotation_dict , first_key , values ):
350451 """Get additional annotation information from a nested
351452 dictionary using the first_key and each value in values
@@ -407,6 +508,8 @@ def get_additional_annotation_information(annotation_dict, first_key, values):
407508 # annotation file where:
408509 # key = {chr}:{intron_start}:{intron_end}:{clust_id}
409510 PARSE_INTRON_ANN = ["gene" ,"ensemblID" ,"verdict" ,"transcripts" ]
511+ INTRON_TRANSCRIPT_IDX = PARSE_INTRON_ANN .index ("transcripts" )
512+ INTRON_VERDICT_IDX = PARSE_INTRON_ANN .index ("verdict" )
410513 intron_ann_dict = index_file (
411514 args .intron_ann ,
412515 keys = ["chr" ,"start" ,"end" ,"clusterID" ],
@@ -417,13 +520,21 @@ def get_additional_annotation_information(annotation_dict, first_key, values):
417520 # Parse exon information from the
418521 # splicing annotation file where:
419522 # key = {transcript}:{exonA_end}:{exonB_start}
523+ SINGULAR_EXON_FEATURES = ["transcript_name" , "exon_strand" ]
524+ SURRONDING_EXON_FEATURES = ["exon_id" , "exon_number" , "exon_start_end" ]
525+ splicing_ann_dict = index_annotated_splice_junctions (
526+ args .splicing_ann ,
527+ single_value_features = SINGULAR_EXON_FEATURES ,
528+ multi_value_features = SURRONDING_EXON_FEATURES ,
529+ multi_value_delim = "|"
530+ )
420531
421532 # Loop through effect sizes file
422533 # and add more detailed information
423534 log ("Writing annotated output file: " , args .output )
424535 ofh = open (args .output , "w" )
425536 with open (args .effect_sizes , "r" ) as ifh :
426- input_header = next (ifh ).rstrip ().split ("\t " ) + PARSE_CLUSTER_SIGNIF + PARSE_INTRON_ANN
537+ input_header = next (ifh ).rstrip ().split ("\t " ) + PARSE_CLUSTER_SIGNIF + PARSE_INTRON_ANN + SINGULAR_EXON_FEATURES + SURRONDING_EXON_FEATURES
427538 output_header = "\t " .join (input_header )
428539 intron_idx = input_header .index ("intron" )
429540 ofh .write (output_header + "\n " )
@@ -459,10 +570,46 @@ def get_additional_annotation_information(annotation_dict, first_key, values):
459570 ":" .join (intron ),
460571 PARSE_INTRON_ANN
461572 )
573+ # Get exon information for the
574+ # splice junction, where:
575+ # key = "transcript:exonA_end:exonB_start"
576+ # i.e "{transcript}:{intron_start}:{intron_end}"
577+ # and values are the features
578+ # in SINGULAR_EXON_FEATURES and
579+ # SURRONDING_EXON_FEATURES
580+ _exon_splice_junction_keys = [
581+ "{0}:{1}:{2}" .format (t ,intron_start ,intron_stop ) \
582+ for t in _intron_ann_values [INTRON_TRANSCRIPT_IDX ].split ("+" )
583+ ]
584+ # Aggregate 1:M features across for
585+ # splice junctions effecting more than
586+ # one transcript
587+ _exon_features_dict = {}
588+ # Loop through each splice junction
589+ for sj in _exon_splice_junction_keys :
590+ for feature in SINGULAR_EXON_FEATURES + SURRONDING_EXON_FEATURES :
591+ if feature not in _exon_features_dict :
592+ _exon_features_dict [feature ] = []
593+ _exon_features_dict [feature ].append (
594+ splicing_ann_dict .get (sj , {}).get (feature , "NA" )
595+ )
596+ # Collapse exon-exon strand information,
597+ # it will be on the same strand for a
598+ # given transcript splice junction
599+ _exon_features_dict ["exon_strand" ] = ["|" .join (_exon_features_dict ["exon_strand" ])]
600+ # Aggregate the exon information
601+ _exon_ann_values = []
602+ for feature in SINGULAR_EXON_FEATURES + SURRONDING_EXON_FEATURES :
603+ # Join multi valued features with the
604+ # multi_value_delim character
605+ _exon_ann_values .append (
606+ "+" .join (_exon_features_dict [feature ])
607+ )
462608 # Write annotated line to output
463- _output_line = "{0}\t {1}\t {2}" .format (
609+ _output_line = "{0}\t {1}\t {2}\t {3} " .format (
464610 "\t " .join (tokens ),
465611 "\t " .join (_cluster_signif_values ),
466- "\t " .join (_intron_ann_values )
612+ "\t " .join (_intron_ann_values ),
613+ "\t " .join (_exon_ann_values )
467614 )
468- ofh .write (_output_line + "\n " )
615+ ofh .write (_output_line + "\n " )
0 commit comments