Skip to content

Commit b0c4026

Browse files
committed
Adding defaults for missing values and write output annoation
1 parent 2806d03 commit b0c4026

1 file changed

Lines changed: 123 additions & 39 deletions

File tree

workflow/scripts/splicing_annotation.py

Lines changed: 123 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
import argparse, gzip, os, sys
1010

1111
# Constants
12-
# Usage and help section
12+
# Usage and help section
1313
_HELP = dedent("""
1414
@Usage:
1515
$ ./splicing_annotation.py [-h] [--version] \\
@@ -28,7 +28,7 @@
2828
• exon_id.1|exon_id.2|...
2929
• exon_number.1|exon_number.2|...
3030
• exon_seqname
31-
• exon_start:exon_end.1|exon_start.2|...
31+
• exon_start:exon_end.1|exon_start.2:exon_end.2|...
3232
• exon_strand
3333
3434
This file has 1:M exon information collapsed by
@@ -216,9 +216,9 @@ def index_header(file_header):
216216

217217

218218
def index_file(
219-
file, key = "transcript_id",
220-
single_value_columns=["gene_id", "gene_name", "transcript_name"],
221-
multi_value_columns=["exon_id", "exon_number", "exon_seqname","exon_start", "exon_end", "exon_strand"],
219+
file, key = "transcript_id",
220+
single_value_columns=["transcript_name", "gene_id", "gene_name"],
221+
multi_value_columns=["exon_id", "exon_number", "exon_seqname","exon_start_end", "exon_strand"],
222222
multi_value_key_name="exon_metadata"
223223
):
224224
"""Parses and indexes a file into a dictionary for quick
@@ -271,7 +271,7 @@ def index_file(
271271
# Handler for opening files, i.e.
272272
# uncompressed or gzip files
273273
open_func = gzip.open if file.endswith('.gz') else open
274-
line_number = 0 # Used for error reporting
274+
line_number = 0 # Used for error reporting
275275
with open_func(file, 'rt') as fh:
276276
header = next(fh)
277277
col_idx = index_header(header)
@@ -283,8 +283,8 @@ def index_file(
283283
# Concatenate mutiple keys into
284284
# a single key separated by the
285285
# key_delim character
286-
_k = tokens[col_idx[key]]
287-
_v = {v: tokens[col_idx[v]] for v in single_value_columns if v in col_idx}
286+
_k = tokens[col_idx[key]] # _k represents transcript_id
287+
_v = {v: get_with_default(tokens,col_idx,v) for v in single_value_columns}
288288
if _k not in file_idx:
289289
# Create a new entry in the index
290290
# with the single value columns
@@ -295,60 +295,90 @@ def index_file(
295295
# transcript_id.
296296
file_idx[_k] = _v
297297
# Parse multi-value columns
298-
_multi_values = [tokens[col_idx[v]] for v in multi_value_columns if v in col_idx]
298+
_multi_values = [get_with_default(tokens,col_idx,v) for v in multi_value_columns]
299299
# Add multi-value columns to the
300300
# file_idx under the multi_value_key_name
301301
if multi_value_key_name not in file_idx[_k]:
302302
file_idx[_k][multi_value_key_name] = []
303303
# Append the multi-value columns
304304
file_idx[_k][multi_value_key_name].append(_multi_values)
305305
log("Finished indexing input file: {0} ({1} lines)".format(file, line_number))
306-
return file_idx
306+
return file_idx
307307

308308

309-
def get_additional_annotation_information(annotation_dict, first_key, values):
310-
"""Get additional annotation information from a nested
311-
dictionary using the first_key and each value in values
312-
as a composite key. Returns a list of values corresponding
313-
to the provided (first_key, value) pairs in annotation_dict.
314-
If a key is not found, it returns "NA" for that value.
315-
@param annotation_dict <dict>:
316-
Nested dictionary containing additional annotation information
317-
keyed by [first_key][v] where v is an element in values.
318-
@param first_key <str>:
319-
First key in the nested dictionary to use for lookups.
320-
@param values <list[str]>:
321-
List of values to retrieve from the dictionary. This is
322-
the second key in the nested dictionary.
323-
@return <list[str]>:
324-
Returns a list of values corresponding to the provided keys.
325-
If a key is not found, it returns "NA" for that value.
309+
def get_with_default(line_list, column_name_idx_dict, column_name, default_value="NA"):
310+
"""Get a value from a list using the column name index
311+
dictionary. If the column name does not exist in the
312+
dictionary, return the default value (i.e "NA").
313+
@param line_list <list[str]>:
314+
List of values from a line in a file. This is the
315+
list that get are retrieving information from.
316+
This function is used to return a value (with a
317+
default value if missing) from within a list or
318+
dictionary comprehension.
319+
@param column_name_idx_dict <dict[str]=int>:
320+
Dictionary mapping column names to their index
321+
@param column_name <str>:
322+
Column name to look up in the dictionary
323+
@param default_value <str>:
324+
Default value to return if the column name does not
325+
exist in the dictionary. Defaults to "NA".
326+
@return value <str>:
327+
Value from the list at the index of the column name,
328+
or the default value if the column name does not exist.
326329
"""
327-
return [annotation_dict.get(first_key, {}).get(v, "NA") for v in values]
330+
# Default value to return if column name DNE
331+
parsed_value = default_value
332+
# Try to get the index of the column name
333+
# This can result in a KeyError/IndexError
334+
# if the column name does not exist in the
335+
# dict. This can be caused upstream of this
336+
# program if this 9th column in the GTF file
337+
# is missing attributes that we expect to be
338+
# present, i.e gene_id, gene_name, exon_number,
339+
# etc. These should be present in 99% of GTF
340+
# files, but it is possible that some files do
341+
# not contain these attributes. Of all the
342+
# attributes that are expected, the exon_number
343+
# may not always be present; however, that is
344+
# okay. It isn't be used in a meaningful way.
345+
if column_name in column_name_idx_dict:
346+
# Get the index of the column name
347+
# from the dictionary.
348+
# This will raise a KeyError if the
349+
# column name does not exist in the dict.
350+
list_idx = column_name_idx_dict[column_name]
351+
if list_idx < len(line_list):
352+
# If the index is within the bounds of the list,
353+
# return the value at that index.
354+
parsed_value = line_list[list_idx]
355+
# Remove quotes from the value
356+
parsed_value = stripped(parsed_value)
357+
return parsed_value
328358

329359

330360
if __name__ == '__main__':
331361
# Parse command line arguments
332362
args = parse_cli_arguments()
333-
363+
334364
# Sanity check for usage
335365
if len(sys.argv) == 1:
336366
# Nothing was provided
337367
fatal('Invalid usage: {0} [-h] ...'.format(os.path.basename(sys.argv[0])))
338-
368+
339369
log("Running splicing annotation script with the following options: ", args)
340370
# Create output directory if
341371
# it does not exist
342372
output_dir = os.path.abspath(os.path.dirname(args.output))
343373
if not os.path.exists(output_dir):
344374
try: os.makedirs(output_dir)
345-
except OSError as e:
375+
except OSError as e:
346376
fatal(
347377
"Fatal error: Failed to create output directory: {0}\n{1}".format(
348378
output_dir, e
349379
)
350380
)
351-
381+
352382
# Parse and collapse exon annotation
353383
# information for each transcript,
354384
# where 1:M exon information is
@@ -358,9 +388,9 @@ def get_additional_annotation_information(annotation_dict, first_key, values):
358388
FIRST_KEY = "transcript_id"
359389
EXON_1toM_KEY = "exon_metadata"
360390
PARSE_1to1_COLUMNS = [
391+
"transcript_name",
361392
"gene_id",
362-
"gene_name",
363-
"transcript_name"
393+
"gene_name"
364394
]
365395
PARSE_1toM_COLUMNS = [
366396
"exon_id",
@@ -372,7 +402,7 @@ def get_additional_annotation_information(annotation_dict, first_key, values):
372402
]
373403
splicing_dict = index_file(
374404
file=args.exon_ann,
375-
key = FIRST_KEY,
405+
key = FIRST_KEY,
376406
single_value_columns=PARSE_1to1_COLUMNS,
377407
multi_value_columns=PARSE_1toM_COLUMNS,
378408
multi_value_key_name=EXON_1toM_KEY
@@ -385,16 +415,70 @@ def get_additional_annotation_information(annotation_dict, first_key, values):
385415
# Sort the list of lists by:
386416
# seqname, start, end, strand
387417
v[EXON_1toM_KEY].sort(
388-
key=lambda x: (x[2], int(x[3]), int(x[4]), x[5])
418+
key=lambda x: (
419+
x[PARSE_1toM_COLUMNS.index("exon_seqname")],
420+
int(x[PARSE_1toM_COLUMNS.index("exon_start")]),
421+
int(x[PARSE_1toM_COLUMNS.index("exon_end")]),
422+
x[PARSE_1toM_COLUMNS.index("exon_strand")]
423+
)
389424
)
390425
# Get the strand information
391426
# for the first exon in the list
392427
# to determine if the order
393428
# needs to be reversed.
394-
if v[EXON_1toM_KEY][0][5] == "-":
395-
# If the strand is negative,
429+
if v[EXON_1toM_KEY][0][PARSE_1toM_COLUMNS.index("exon_strand")] == "-":
430+
# If the strand is negative,
396431
# reverse the order of the exon
397432
# information to reflect the
398433
# correct splicing order.
399434
v[EXON_1toM_KEY].reverse()
400-
log("Finished sorting exon information by seqname, start, end, strand")
435+
log("Finished sorting exon information by seqname, start, end, strand")
436+
# Write the splicing annotation to the output file
437+
log("Writing splicing annotation to output file: ", args.output)
438+
MISSING_VALUES = ["NA", "Unknown", ""]
439+
with open(args.output, 'w') as out_fh:
440+
# Write the header line
441+
out_fh.write(
442+
"\t".join([
443+
"transcript_id","transcript_name",
444+
"gene_id","gene_name",
445+
"exon_id","exon_number",
446+
"exon_seqname","exon_start_end",
447+
"exon_strand"
448+
]) + "\n"
449+
)
450+
# Write the splicing annotation
451+
for transcript_id, data in splicing_dict.items():
452+
exon_metadata = data[EXON_1toM_KEY]
453+
exon_ids = "|".join([x[PARSE_1toM_COLUMNS.index("exon_id")] for x in exon_metadata])
454+
# Replace missing values with their
455+
# index after coordinate sorting
456+
exon_numbers = [x[PARSE_1toM_COLUMNS.index("exon_number")] for x in exon_metadata]
457+
exon_numbers = [i if e in MISSING_VALUES else e for i,e in enumerate(exon_numbers)]
458+
exon_numbers = "|".join(exon_numbers)
459+
# NOTE: All strand information should
460+
# be the same for a given transcript.
461+
exon_seqnames = "|".join(list(set([x[PARSE_1toM_COLUMNS.index("exon_seqname")] for x in exon_metadata])))
462+
exon_start_ends = "|".join(
463+
["{0}:{1}".format(
464+
x[PARSE_1toM_COLUMNS.index("exon_start")], x[PARSE_1toM_COLUMNS.index("exon_end")]
465+
) for x in exon_metadata]
466+
)
467+
# NOTE: All strand information should
468+
# be the same for a given transcript.
469+
exon_strands = "|".join(list(set([x[PARSE_1toM_COLUMNS.index("exon_strand")] for x in exon_metadata])))
470+
out_fh.write(
471+
"\t".join([
472+
transcript_id,
473+
data["transcript_name"],
474+
data["gene_id"],
475+
data["gene_name"],
476+
exon_ids,
477+
exon_numbers,
478+
exon_seqnames,
479+
exon_start_ends,
480+
exon_strands
481+
]) + "\n"
482+
)
483+
log("Finished writing splicing annotation to output file: ", args.output)
484+
log("Splicing annotation script completed successfully!")

0 commit comments

Comments
 (0)