99import argparse , gzip , os , sys
1010
1111# Constants
12- # Usage and help section
12+ # Usage and help section
1313_HELP = dedent ("""
1414@Usage:
1515 $ ./splicing_annotation.py [-h] [--version] \\
2828 • exon_id.1|exon_id.2|...
2929 • exon_number.1|exon_number.2|...
3030 • exon_seqname
31- • exon_start:exon_end.1|exon_start.2|...
31+ • exon_start:exon_end.1|exon_start.2:exon_end.2 |...
3232 • exon_strand
3333
3434 This file has 1:M exon information collapsed by
@@ -216,9 +216,9 @@ def index_header(file_header):
216216
217217
218218def index_file (
219- file , key = "transcript_id" ,
220- single_value_columns = ["gene_id " , "gene_name " , "transcript_name " ],
221- multi_value_columns = ["exon_id" , "exon_number" , "exon_seqname" ,"exon_start" , "exon_end " , "exon_strand" ],
219+ file , key = "transcript_id" ,
220+ single_value_columns = ["transcript_name " , "gene_id " , "gene_name " ],
221+ multi_value_columns = ["exon_id" , "exon_number" , "exon_seqname" ,"exon_start_end " , "exon_strand" ],
222222 multi_value_key_name = "exon_metadata"
223223 ):
224224 """Parses and indexes a file into a dictionary for quick
@@ -271,7 +271,7 @@ def index_file(
271271 # Handler for opening files, i.e.
272272 # uncompressed or gzip files
273273 open_func = gzip .open if file .endswith ('.gz' ) else open
274- line_number = 0 # Used for error reporting
274+ line_number = 0 # Used for error reporting
275275 with open_func (file , 'rt' ) as fh :
276276 header = next (fh )
277277 col_idx = index_header (header )
@@ -283,8 +283,8 @@ def index_file(
283283 # Concatenate mutiple keys into
284284 # a single key separated by the
285285 # key_delim character
286- _k = tokens [col_idx [key ]]
287- _v = {v : tokens [ col_idx [ v ]] for v in single_value_columns if v in col_idx }
286+ _k = tokens [col_idx [key ]] # _k represents transcript_id
287+ _v = {v : get_with_default ( tokens , col_idx , v ) for v in single_value_columns }
288288 if _k not in file_idx :
289289 # Create a new entry in the index
290290 # with the single value columns
@@ -295,60 +295,90 @@ def index_file(
295295 # transcript_id.
296296 file_idx [_k ] = _v
297297 # Parse multi-value columns
298- _multi_values = [tokens [ col_idx [ v ]] for v in multi_value_columns if v in col_idx ]
298+ _multi_values = [get_with_default ( tokens , col_idx , v ) for v in multi_value_columns ]
299299 # Add multi-value columns to the
300300 # file_idx under the multi_value_key_name
301301 if multi_value_key_name not in file_idx [_k ]:
302302 file_idx [_k ][multi_value_key_name ] = []
303303 # Append the multi-value columns
304304 file_idx [_k ][multi_value_key_name ].append (_multi_values )
305305 log ("Finished indexing input file: {0} ({1} lines)" .format (file , line_number ))
306- return file_idx
306+ return file_idx
307307
308308
309- def get_additional_annotation_information (annotation_dict , first_key , values ):
310- """Get additional annotation information from a nested
311- dictionary using the first_key and each value in values
312- as a composite key. Returns a list of values corresponding
313- to the provided (first_key, value) pairs in annotation_dict.
314- If a key is not found, it returns "NA" for that value.
315- @param annotation_dict <dict>:
316- Nested dictionary containing additional annotation information
317- keyed by [first_key][v] where v is an element in values.
318- @param first_key <str>:
319- First key in the nested dictionary to use for lookups.
320- @param values <list[str]>:
321- List of values to retrieve from the dictionary. This is
322- the second key in the nested dictionary.
323- @return <list[str]>:
324- Returns a list of values corresponding to the provided keys.
325- If a key is not found, it returns "NA" for that value.
309+ def get_with_default (line_list , column_name_idx_dict , column_name , default_value = "NA" ):
310+ """Get a value from a list using the column name index
311+ dictionary. If the column name does not exist in the
312+ dictionary, return the default value (i.e "NA").
313+ @param line_list <list[str]>:
314+ List of values from a line in a file. This is the
315+ list that get are retrieving information from.
316+ This function is used to return a value (with a
317+ default value if missing) from within a list or
318+ dictionary comprehension.
319+ @param column_name_idx_dict <dict[str]=int>:
320+ Dictionary mapping column names to their index
321+ @param column_name <str>:
322+ Column name to look up in the dictionary
323+ @param default_value <str>:
324+ Default value to return if the column name does not
325+ exist in the dictionary. Defaults to "NA".
326+ @return value <str>:
327+ Value from the list at the index of the column name,
328+ or the default value if the column name does not exist.
326329 """
327- return [annotation_dict .get (first_key , {}).get (v , "NA" ) for v in values ]
330+ # Default value to return if column name DNE
331+ parsed_value = default_value
332+ # Try to get the index of the column name
333+ # This can result in a KeyError/IndexError
334+ # if the column name does not exist in the
335+ # dict. This can be caused upstream of this
336+ # program if this 9th column in the GTF file
337+ # is missing attributes that we expect to be
338+ # present, i.e gene_id, gene_name, exon_number,
339+ # etc. These should be present in 99% of GTF
340+ # files, but it is possible that some files do
341+ # not contain these attributes. Of all the
342+ # attributes that are expected, the exon_number
343+ # may not always be present; however, that is
344+ # okay. It isn't be used in a meaningful way.
345+ if column_name in column_name_idx_dict :
346+ # Get the index of the column name
347+ # from the dictionary.
348+ # This will raise a KeyError if the
349+ # column name does not exist in the dict.
350+ list_idx = column_name_idx_dict [column_name ]
351+ if list_idx < len (line_list ):
352+ # If the index is within the bounds of the list,
353+ # return the value at that index.
354+ parsed_value = line_list [list_idx ]
355+ # Remove quotes from the value
356+ parsed_value = stripped (parsed_value )
357+ return parsed_value
328358
329359
330360if __name__ == '__main__' :
331361 # Parse command line arguments
332362 args = parse_cli_arguments ()
333-
363+
334364 # Sanity check for usage
335365 if len (sys .argv ) == 1 :
336366 # Nothing was provided
337367 fatal ('Invalid usage: {0} [-h] ...' .format (os .path .basename (sys .argv [0 ])))
338-
368+
339369 log ("Running splicing annotation script with the following options: " , args )
340370 # Create output directory if
341371 # it does not exist
342372 output_dir = os .path .abspath (os .path .dirname (args .output ))
343373 if not os .path .exists (output_dir ):
344374 try : os .makedirs (output_dir )
345- except OSError as e :
375+ except OSError as e :
346376 fatal (
347377 "Fatal error: Failed to create output directory: {0}\n {1}" .format (
348378 output_dir , e
349379 )
350380 )
351-
381+
352382 # Parse and collapse exon annotation
353383 # information for each transcript,
354384 # where 1:M exon information is
@@ -358,9 +388,9 @@ def get_additional_annotation_information(annotation_dict, first_key, values):
358388 FIRST_KEY = "transcript_id"
359389 EXON_1toM_KEY = "exon_metadata"
360390 PARSE_1to1_COLUMNS = [
391+ "transcript_name" ,
361392 "gene_id" ,
362- "gene_name" ,
363- "transcript_name"
393+ "gene_name"
364394 ]
365395 PARSE_1toM_COLUMNS = [
366396 "exon_id" ,
@@ -372,7 +402,7 @@ def get_additional_annotation_information(annotation_dict, first_key, values):
372402 ]
373403 splicing_dict = index_file (
374404 file = args .exon_ann ,
375- key = FIRST_KEY ,
405+ key = FIRST_KEY ,
376406 single_value_columns = PARSE_1to1_COLUMNS ,
377407 multi_value_columns = PARSE_1toM_COLUMNS ,
378408 multi_value_key_name = EXON_1toM_KEY
@@ -385,16 +415,70 @@ def get_additional_annotation_information(annotation_dict, first_key, values):
385415 # Sort the list of lists by:
386416 # seqname, start, end, strand
387417 v [EXON_1toM_KEY ].sort (
388- key = lambda x : (x [2 ], int (x [3 ]), int (x [4 ]), x [5 ])
418+ key = lambda x : (
419+ x [PARSE_1toM_COLUMNS .index ("exon_seqname" )],
420+ int (x [PARSE_1toM_COLUMNS .index ("exon_start" )]),
421+ int (x [PARSE_1toM_COLUMNS .index ("exon_end" )]),
422+ x [PARSE_1toM_COLUMNS .index ("exon_strand" )]
423+ )
389424 )
390425 # Get the strand information
391426 # for the first exon in the list
392427 # to determine if the order
393428 # needs to be reversed.
394- if v [EXON_1toM_KEY ][0 ][5 ] == "-" :
395- # If the strand is negative,
429+ if v [EXON_1toM_KEY ][0 ][PARSE_1toM_COLUMNS . index ( "exon_strand" ) ] == "-" :
430+ # If the strand is negative,
396431 # reverse the order of the exon
397432 # information to reflect the
398433 # correct splicing order.
399434 v [EXON_1toM_KEY ].reverse ()
400- log ("Finished sorting exon information by seqname, start, end, strand" )
435+ log ("Finished sorting exon information by seqname, start, end, strand" )
436+ # Write the splicing annotation to the output file
437+ log ("Writing splicing annotation to output file: " , args .output )
438+ MISSING_VALUES = ["NA" , "Unknown" , "" ]
439+ with open (args .output , 'w' ) as out_fh :
440+ # Write the header line
441+ out_fh .write (
442+ "\t " .join ([
443+ "transcript_id" ,"transcript_name" ,
444+ "gene_id" ,"gene_name" ,
445+ "exon_id" ,"exon_number" ,
446+ "exon_seqname" ,"exon_start_end" ,
447+ "exon_strand"
448+ ]) + "\n "
449+ )
450+ # Write the splicing annotation
451+ for transcript_id , data in splicing_dict .items ():
452+ exon_metadata = data [EXON_1toM_KEY ]
453+ exon_ids = "|" .join ([x [PARSE_1toM_COLUMNS .index ("exon_id" )] for x in exon_metadata ])
454+ # Replace missing values with their
455+ # index after coordinate sorting
456+ exon_numbers = [x [PARSE_1toM_COLUMNS .index ("exon_number" )] for x in exon_metadata ]
457+ exon_numbers = [i if e in MISSING_VALUES else e for i ,e in enumerate (exon_numbers )]
458+ exon_numbers = "|" .join (exon_numbers )
459+ # NOTE: All strand information should
460+ # be the same for a given transcript.
461+ exon_seqnames = "|" .join (list (set ([x [PARSE_1toM_COLUMNS .index ("exon_seqname" )] for x in exon_metadata ])))
462+ exon_start_ends = "|" .join (
463+ ["{0}:{1}" .format (
464+ x [PARSE_1toM_COLUMNS .index ("exon_start" )], x [PARSE_1toM_COLUMNS .index ("exon_end" )]
465+ ) for x in exon_metadata ]
466+ )
467+ # NOTE: All strand information should
468+ # be the same for a given transcript.
469+ exon_strands = "|" .join (list (set ([x [PARSE_1toM_COLUMNS .index ("exon_strand" )] for x in exon_metadata ])))
470+ out_fh .write (
471+ "\t " .join ([
472+ transcript_id ,
473+ data ["transcript_name" ],
474+ data ["gene_id" ],
475+ data ["gene_name" ],
476+ exon_ids ,
477+ exon_numbers ,
478+ exon_seqnames ,
479+ exon_start_ends ,
480+ exon_strands
481+ ]) + "\n "
482+ )
483+ log ("Finished writing splicing annotation to output file: " , args .output )
484+ log ("Splicing annotation script completed successfully!" )
0 commit comments