1616 [--fdr-filter FDR_FILTER] \\
1717 [--fasta-delim FASTA_DELIM] \\
1818 --isoform-switch-results ISOFORM_SWITCH_RESULTS \\
19- --splicing-ann SPLICIN_ANN_FILE \\
19+ --splicing-ann SPLICING_ANN_FILE \\
2020 --transcripts-fasta TRANSCRIPTS_FASTA \\
2121 --output OUTPUT_FILE
2222@About:
4444 • isoform_id
4545 • gene_id
4646 • isoform_switch_q_value
47- -s, --splicing-ann SPLICIN_ANN_FILE
47+ -s, --splicing-ann SPLICING_ANN_FILE
4848 Splicing annotation file. This file should
4949 be a tab-separated file containing the
5050 following columns:
@@ -207,15 +207,15 @@ def parse_cli_arguments():
207207 formatter_class = argparse .RawDescriptionHelpFormatter ,
208208 usage = argparse .SUPPRESS ,
209209 )
210- # Isoform switch results file
210+ # Input isoform switch results file
211211 parser .add_argument (
212212 '-i' , '--isoform-switch-results' ,
213213 type = lambda file : \
214214 check_permissions (parser , file , os .R_OK ),
215215 required = True ,
216216 help = argparse .SUPPRESS
217217 )
218- # Splicing annotation file
218+ # Input splicing annotation file
219219 parser .add_argument (
220220 '-s' , '--splicing-ann' ,
221221 type = lambda file : \
@@ -302,15 +302,215 @@ def index_header(file_header):
302302 return idx
303303
304304
305+ def parsed_isa_switching_genes (
306+ file ,
307+ geneid_column = "gene_id" ,
308+ isoformid_column = "isoform_id" ,
309+ fdr_column = "isoform_switch_q_value" ,
310+ fdr_threshold = 0.1
311+ ):
312+ """Parses an IsoformSwitchAnalyzeR results file
313+ and returns a dictionary of isoform switches
314+ with significant FDR values. The returned dict
315+ contains gene IDs as keys and a list of isoform
316+ IDs that have significant isoform switches as values.
317+ The FDR threshold is used to identify genes/isoforms
318+ with significant isoform switches.
319+ @param file <str>:
320+ Path to the IsoformSwitchAnalyzeR results file.
321+ @param geneid_column <str>:
322+ Column name for gene IDs in the results file.
323+ @param isoformid_column <str>:
324+ Column name for isoform IDs in the results file.
325+ @param fdr_column <str>:
326+ Column name for FDR values in the results file.
327+ @param fdr_threshold <float>:
328+ FDR threshold to use for filtering isoform switches
329+ @return isoform_switches <dict[str]=list[str]>:
330+ Dictionary of genes with significant isoform switches,
331+ where keys are gene IDs and values are lists of
332+ isoform IDs that have significant isoform switches.
333+ """
334+ log ("Starting to parse ISA results file: {0}" .format (file ))
335+ isoform_switches = {}
336+ # Handler for opening files, i.e.
337+ # uncompressed or gzip files
338+ open_func = gzip .open if file .endswith ('.gz' ) else open
339+ line_number = 0 # Use for error reporting
340+ with open_func (file , 'rt' ) as fh :
341+ header = next (fh )
342+ col_idx = index_header (header )
343+ for line in fh :
344+ # Increment line number
345+ line_number += 1
346+ # Split the line into columns
347+ tokens = line .strip ().split ('\t ' )
348+ switch_fdr = float (tokens [col_idx [fdr_column ]])
349+ if switch_fdr > fdr_threshold :
350+ # Skip lines with FDR above threshold
351+ continue
352+ # Get gene ID and isoform ID
353+ _k = tokens [col_idx [geneid_column ]] # _k represents gene_id
354+ if _k not in isoform_switches :
355+ isoform_switches [_k ] = []
356+ isoform_switches [_k ].append (
357+ tokens [col_idx [isoformid_column ]]
358+ )
359+ log ("Finished parsing ISA results file: {0}" .format (file ))
360+ return isoform_switches
361+
362+
363+ def get_with_default (line_list , column_name_idx_dict , column_name , default_value = "NA" ):
364+ """Get a value from a list using the column name index
365+ dictionary. If the column name does not exist in the
366+ dictionary, return the default value (i.e "NA").
367+ @param line_list <list[str]>:
368+ List of values from a line in a file. This is the
369+ list that get are retrieving information from.
370+ This function is used to return a value (with a
371+ default value if missing) from within a list or
372+ dictionary comprehension.
373+ @param column_name_idx_dict <dict[str]=int>:
374+ Dictionary mapping column names to their index
375+ @param column_name <str>:
376+ Column name to look up in the dictionary
377+ @param default_value <str>:
378+ Default value to return if the column name does not
379+ exist in the dictionary. Defaults to "NA".
380+ @return value <str>:
381+ Value from the list at the index of the column name,
382+ or the default value if the column name does not exist.
383+ """
384+ # Default value to return if column name DNE
385+ parsed_value = default_value
386+ # Try to get the index of the column name
387+ # This can result in a KeyError/IndexError
388+ # if the column name does not exist in the
389+ # dict.
390+ if column_name in column_name_idx_dict :
391+ # Get the index of the column name
392+ # from the dictionary.
393+ # This will raise a KeyError if the
394+ # column name does not exist in the dict.
395+ list_idx = column_name_idx_dict [column_name ]
396+ if list_idx < len (line_list ):
397+ # If the index is within the bounds of the list,
398+ # return the value at that index.
399+ parsed_value = line_list [list_idx ]
400+ # Remove quotes from the value
401+ parsed_value = stripped (parsed_value )
402+ return parsed_value
403+
404+
405+ def index_file (
406+ file , key = "gene_id" ,
407+ single_value_columns = ["gene_name" ],
408+ multi_value_columns = ["transcript_id" , "transcript_name" ]
409+ ):
410+ """Parses and indexes a file into a dictionary for quick
411+ lookups later. This file is indexed by keys representing
412+ columns defined via single_value_columns and multi_value_columns.
413+ If a columns is listed in the multi_value_columns list, it will
414+ return a list of values if there is a 1:M relationship.
415+ @param file <str>:
416+ File to parse and index. Must contain a header with
417+ the columns listed in keys and values. The index of
418+ these columns will be automatically resolved.
419+ @param key <str>:
420+ Key to use for the first level of the index. Defaults
421+ to "gene_id". This is the first key in the nested
422+ dictionary.
423+ @param single_value_columns <list[str]>:
424+ List of columns that contain single values per key.
425+ These are the second keys in the nested dictionary.
426+ This returns a single value as a string.
427+ @param multi_value_columns <list[str]>:
428+ List of columns that contain multiple values per key.
429+ These values are stored as a list in the nested dictionary
430+ to accomodate 1:M relationship between the key and the
431+ values in this list.
432+ @return file_idx <dict[str]=str>:
433+ Nested dictionary where,
434+ • 1st_key = gene_id
435+ • 2nd_key = one of the following:
436+ gene_name|transcript_id|transcript_name
437+ @note: file_idx[transcript_id][2nd_ley] returns a
438+ either a string or a list, depending on whether the
439+ column is single_value_columns or multi_value_columns
440+ """
441+ log ("Started indexing input file: {0}" .format (file ))
442+ file_idx = {}
443+ # Handler for opening files, i.e.
444+ # uncompressed or gzip files
445+ open_func = gzip .open if file .endswith ('.gz' ) else open
446+ line_number = 0 # Used for error reporting
447+ with open_func (file , 'rt' ) as fh :
448+ header = next (fh )
449+ col_idx = index_header (header )
450+ for line in fh :
451+ # Increment line number
452+ line_number += 1
453+ # Split the line into columns
454+ tokens = line .strip ().split ('\t ' )
455+ _k = tokens [col_idx [key ]] # _k represents gene_id
456+ _v = {v : get_with_default (tokens ,col_idx ,v ) for v in single_value_columns }
457+ if _k not in file_idx :
458+ # Create a new entry in the index
459+ # with the single value columns
460+ # this prevents it from being
461+ # overwritten in every iteration.
462+ # The values in _v should be the
463+ # same on every line for each
464+ # transcript_id.
465+ file_idx [_k ] = _v
466+ # Parse multi-value columns
467+ for multi_v_key in multi_value_columns :
468+ if multi_v_key not in file_idx [_k ]:
469+ file_idx [_k ][multi_v_key ] = [] # init as empty list
470+ # Append the multi-value columns to the index
471+ file_idx [_k ][multi_v_key ].append (get_with_default (tokens ,col_idx ,multi_v_key ))
472+ log ("Finished indexing input file: {0} ({1} lines)" .format (file , line_number ))
473+ return file_idx
474+
475+
476+ def fasta (filename ):
477+ """
478+ Reads in a FASTA file and yields each of its entries.
479+ The generator yields each sequence identifier and its
480+ corresponding sequence to ensure a low memory profile.
481+ If a sequence occurs over multiple lines, the yielded
482+ sequence is concatenated.
483+ @param filename <str>:
484+ Path of FASTA file to read and parse
485+ @yield chrom, sequence <str>, <str>:
486+ Yields each seq id and seq in the FASTA file
487+ """
488+ log ("Started parsing FASTA file: {0}" .format (filename ))
489+ open_func = gzip .open if filename .endswith ('.gz' ) else open
490+ with open_func (filename , 'rt' ) as file :
491+ sequence , chrom = '' , ''
492+ for line in file :
493+ line = line .strip ()
494+ if line .startswith ('>' ) and sequence :
495+ # base case for additional entries
496+ yield chrom , sequence
497+ chrom = line [1 :] # remove the > symbol
498+ sequence = ''
499+ elif line .startswith ('>' ):
500+ # base case for first entry in fasta file
501+ chrom = line [1 :] # remove the > symbol
502+ else :
503+ # concatenate multi-line sequences
504+ sequence += line
505+ else :
506+ yield chrom , sequence
507+ log ("Finished parsing FASTA file: {0}" .format (filename ))
508+
509+
305510if __name__ == '__main__' :
306511 # Parse command line arguments
307512 args = parse_cli_arguments ()
308513
309- # Sanity check for usage
310- if len (sys .argv ) == 1 :
311- # Nothing was provided
312- fatal ('Invalid usage: {0} [-h] ...' .format (os .path .basename (sys .argv [0 ])))
313-
314514 log ("Running isoform sequences script with the following options: " , args )
315515 # Create output directory if
316516 # it does not exist
@@ -324,4 +524,88 @@ def index_header(file_header):
324524 )
325525 )
326526
327- log ("Finished running isoform sequences script!" )
527+ # Parse the isoform switch results file
528+ # where keys are gene IDs and values
529+ # are lists of isoform IDs that have
530+ # significant isoform switches
531+ top_isa_switches = parsed_isa_switching_genes (
532+ args .isoform_switch_results ,
533+ geneid_column = "gene_id" ,
534+ isoformid_column = "isoform_id" ,
535+ fdr_column = "isoform_switch_q_value" ,
536+ fdr_threshold = args .fdr_filter
537+ )
538+ log ("Found {0} genes with significant isoform switches." .format (len (top_isa_switches )))
539+ # Parse the splicing annotation file
540+ gene2isoform_dict = index_file (
541+ args .splicing_ann ,
542+ key = "gene_id" ,
543+ single_value_columns = ["gene_name" ],
544+ multi_value_columns = ["transcript_id" , "transcript_name" ]
545+ )
546+ log ('Indexed transcript information for {0} genes' .format (len (gene2isoform_dict )))
547+ # Create an index of transript_id
548+ # to gene_id mappings
549+ transcript2gene_dict = index_file (
550+ args .splicing_ann ,
551+ key = "transcript_id" ,
552+ single_value_columns = ["gene_id" ],
553+ multi_value_columns = []
554+ )
555+ log ('Indexed gene information for {0} transcripts' .format (len (transcript2gene_dict )))
556+ # Parse the transcripts FASTA file,
557+ # where keys are transcript IDs and values
558+ # are the corresponding concatnated sequences
559+ transcript_sequences = {}
560+ for chrom , sequence in fasta (args .transcripts_fasta ):
561+ # Extract transcript ID from the FASTA header
562+ # using the delimiter specified by the user
563+ transcript_id = chrom .split (args .fasta_delim )[0 ]
564+ transcript_sequences [transcript_id ] = sequence
565+ transcript_sequences [transcript_id ] = sequence
566+ log ("Parsed {0} transcripts from FASTA file." .format (len (transcript_sequences )))
567+ # Create a dictionary to group the
568+ # isoform switches by gene_id, ISA
569+ # has a tendency to report the gene_id
570+ # as the gene_name, meaning that gene_id
571+ # and gene_name are often the same.
572+ log ("Started grouping isoform switches by gene_id." )
573+ top_isa_switches_grouped_by_gene = {}
574+ for gene_id , transcripts_list in top_isa_switches .items ():
575+ # Group the isoform switches by gene_id
576+ for transcript_id in transcripts_list :
577+ _converted_gene_id = transcript2gene_dict .get (transcript_id , {}).get ("gene_id" , "NA" )
578+ if _converted_gene_id == "NA" :
579+ log ("Warning: Transcript ID '{0}' does not have a corresponding gene_id, skipping..." .format (transcript_id ))
580+ continue
581+ if _converted_gene_id not in top_isa_switches_grouped_by_gene :
582+ top_isa_switches_grouped_by_gene [_converted_gene_id ] = []
583+ top_isa_switches_grouped_by_gene [_converted_gene_id ].append (transcript_id )
584+ log ("Finished grouping isoform switches by {0} gene_id." .format (len (top_isa_switches_grouped_by_gene )))
585+ # Write the isoform sequences to the output file
586+ ofh_seq_number = 0
587+ with open (args .output , "w" ) as ofh :
588+ for gene_id in top_isa_switches_grouped_by_gene :
589+ # Get the the transcript_ids for
590+ # all the transcripts belonging to
591+ # the gene_id that has a significant
592+ # isoform switch
593+ isoform_list = gene2isoform_dict .get (gene_id , {}).get ("transcript_id" , [])
594+ gene_name = gene2isoform_dict .get (gene_id , {}).get ("gene_name" , "NA" )
595+ # Write each isoform sequence to the output file
596+ for isoform_id in isoform_list :
597+ if isoform_id not in transcript_sequences :
598+ log ("Warning: Transcript ID '{0}' not found in FASTA file, skipping." .format (isoform_id ))
599+ continue
600+ sequence = transcript_sequences [isoform_id ]
601+ # Output FASTA file format:
602+ # >transcript_id|gene_id|gene_name
603+ # ATGTTTACAGTACCCCAA
604+ ofh_seq_number += 1
605+ ofh .write (
606+ ">{0}{1}{2}{1}{3}\n {4}\n " .format (
607+ isoform_id , args .fasta_delim , gene_id , gene_name , sequence
608+ )
609+ )
610+ log ("Finished writing {0} isoform sequences across to output file: {1}" .format (ofh_seq_number , args .output ))
611+ log ("Finished running isoform sequences script!" )
0 commit comments