Skip to content

Commit bf0579c

Browse files
committed
Adding parsers and logic to find/write all the transcripts for an effected gene.
1 parent 30cfc9d commit bf0579c

1 file changed

Lines changed: 294 additions & 10 deletions

File tree

workflow/scripts/isoform_sequences.py

Lines changed: 294 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
[--fdr-filter FDR_FILTER] \\
1717
[--fasta-delim FASTA_DELIM] \\
1818
--isoform-switch-results ISOFORM_SWITCH_RESULTS \\
19-
--splicing-ann SPLICIN_ANN_FILE \\
19+
--splicing-ann SPLICING_ANN_FILE \\
2020
--transcripts-fasta TRANSCRIPTS_FASTA \\
2121
--output OUTPUT_FILE
2222
@About:
@@ -44,7 +44,7 @@
4444
• isoform_id
4545
• gene_id
4646
• isoform_switch_q_value
47-
-s, --splicing-ann SPLICIN_ANN_FILE
47+
-s, --splicing-ann SPLICING_ANN_FILE
4848
Splicing annotation file. This file should
4949
be a tab-separated file containing the
5050
following columns:
@@ -207,15 +207,15 @@ def parse_cli_arguments():
207207
formatter_class=argparse.RawDescriptionHelpFormatter,
208208
usage = argparse.SUPPRESS,
209209
)
210-
# Isoform switch results file
210+
# Input isoform switch results file
211211
parser.add_argument(
212212
'-i', '--isoform-switch-results',
213213
type = lambda file: \
214214
check_permissions(parser, file, os.R_OK),
215215
required=True,
216216
help=argparse.SUPPRESS
217217
)
218-
# Splicing annotation file
218+
# Input splicing annotation file
219219
parser.add_argument(
220220
'-s', '--splicing-ann',
221221
type = lambda file: \
@@ -302,15 +302,215 @@ def index_header(file_header):
302302
return idx
303303

304304

305+
def parsed_isa_switching_genes(
306+
file,
307+
geneid_column="gene_id",
308+
isoformid_column="isoform_id",
309+
fdr_column="isoform_switch_q_value",
310+
fdr_threshold=0.1
311+
):
312+
"""Parses an IsoformSwitchAnalyzeR results file
313+
and returns a dictionary of isoform switches
314+
with significant FDR values. The returned dict
315+
contains gene IDs as keys and a list of isoform
316+
IDs that have significant isoform switches as values.
317+
The FDR threshold is used to identify genes/isoforms
318+
with significant isoform switches.
319+
@param file <str>:
320+
Path to the IsoformSwitchAnalyzeR results file.
321+
@param geneid_column <str>:
322+
Column name for gene IDs in the results file.
323+
@param isoformid_column <str>:
324+
Column name for isoform IDs in the results file.
325+
@param fdr_column <str>:
326+
Column name for FDR values in the results file.
327+
@param fdr_threshold <float>:
328+
FDR threshold to use for filtering isoform switches
329+
@return isoform_switches <dict[str]=list[str]>:
330+
Dictionary of genes with significant isoform switches,
331+
where keys are gene IDs and values are lists of
332+
isoform IDs that have significant isoform switches.
333+
"""
334+
log("Starting to parse ISA results file: {0}".format(file))
335+
isoform_switches = {}
336+
# Handler for opening files, i.e.
337+
# uncompressed or gzip files
338+
open_func = gzip.open if file.endswith('.gz') else open
339+
line_number = 0 # Use for error reporting
340+
with open_func(file, 'rt') as fh:
341+
header = next(fh)
342+
col_idx = index_header(header)
343+
for line in fh:
344+
# Increment line number
345+
line_number += 1
346+
# Split the line into columns
347+
tokens = line.strip().split('\t')
348+
switch_fdr = float(tokens[col_idx[fdr_column]])
349+
if switch_fdr > fdr_threshold:
350+
# Skip lines with FDR above threshold
351+
continue
352+
# Get gene ID and isoform ID
353+
_k = tokens[col_idx[geneid_column]] # _k represents gene_id
354+
if _k not in isoform_switches:
355+
isoform_switches[_k] = []
356+
isoform_switches[_k].append(
357+
tokens[col_idx[isoformid_column]]
358+
)
359+
log("Finished parsing ISA results file: {0}".format(file))
360+
return isoform_switches
361+
362+
363+
def get_with_default(line_list, column_name_idx_dict, column_name, default_value="NA"):
364+
"""Get a value from a list using the column name index
365+
dictionary. If the column name does not exist in the
366+
dictionary, return the default value (i.e "NA").
367+
@param line_list <list[str]>:
368+
List of values from a line in a file. This is the
369+
list that get are retrieving information from.
370+
This function is used to return a value (with a
371+
default value if missing) from within a list or
372+
dictionary comprehension.
373+
@param column_name_idx_dict <dict[str]=int>:
374+
Dictionary mapping column names to their index
375+
@param column_name <str>:
376+
Column name to look up in the dictionary
377+
@param default_value <str>:
378+
Default value to return if the column name does not
379+
exist in the dictionary. Defaults to "NA".
380+
@return value <str>:
381+
Value from the list at the index of the column name,
382+
or the default value if the column name does not exist.
383+
"""
384+
# Default value to return if column name DNE
385+
parsed_value = default_value
386+
# Try to get the index of the column name
387+
# This can result in a KeyError/IndexError
388+
# if the column name does not exist in the
389+
# dict.
390+
if column_name in column_name_idx_dict:
391+
# Get the index of the column name
392+
# from the dictionary.
393+
# This will raise a KeyError if the
394+
# column name does not exist in the dict.
395+
list_idx = column_name_idx_dict[column_name]
396+
if list_idx < len(line_list):
397+
# If the index is within the bounds of the list,
398+
# return the value at that index.
399+
parsed_value = line_list[list_idx]
400+
# Remove quotes from the value
401+
parsed_value = stripped(parsed_value)
402+
return parsed_value
403+
404+
405+
def index_file(
406+
file, key = "gene_id",
407+
single_value_columns=["gene_name"],
408+
multi_value_columns=["transcript_id", "transcript_name"]
409+
):
410+
"""Parses and indexes a file into a dictionary for quick
411+
lookups later. This file is indexed by keys representing
412+
columns defined via single_value_columns and multi_value_columns.
413+
If a columns is listed in the multi_value_columns list, it will
414+
return a list of values if there is a 1:M relationship.
415+
@param file <str>:
416+
File to parse and index. Must contain a header with
417+
the columns listed in keys and values. The index of
418+
these columns will be automatically resolved.
419+
@param key <str>:
420+
Key to use for the first level of the index. Defaults
421+
to "gene_id". This is the first key in the nested
422+
dictionary.
423+
@param single_value_columns <list[str]>:
424+
List of columns that contain single values per key.
425+
These are the second keys in the nested dictionary.
426+
This returns a single value as a string.
427+
@param multi_value_columns <list[str]>:
428+
List of columns that contain multiple values per key.
429+
These values are stored as a list in the nested dictionary
430+
to accomodate 1:M relationship between the key and the
431+
values in this list.
432+
@return file_idx <dict[str]=str>:
433+
Nested dictionary where,
434+
• 1st_key = gene_id
435+
• 2nd_key = one of the following:
436+
gene_name|transcript_id|transcript_name
437+
@note: file_idx[transcript_id][2nd_ley] returns a
438+
either a string or a list, depending on whether the
439+
column is single_value_columns or multi_value_columns
440+
"""
441+
log("Started indexing input file: {0}".format(file))
442+
file_idx = {}
443+
# Handler for opening files, i.e.
444+
# uncompressed or gzip files
445+
open_func = gzip.open if file.endswith('.gz') else open
446+
line_number = 0 # Used for error reporting
447+
with open_func(file, 'rt') as fh:
448+
header = next(fh)
449+
col_idx = index_header(header)
450+
for line in fh:
451+
# Increment line number
452+
line_number += 1
453+
# Split the line into columns
454+
tokens = line.strip().split('\t')
455+
_k = tokens[col_idx[key]] # _k represents gene_id
456+
_v = {v: get_with_default(tokens,col_idx,v) for v in single_value_columns}
457+
if _k not in file_idx:
458+
# Create a new entry in the index
459+
# with the single value columns
460+
# this prevents it from being
461+
# overwritten in every iteration.
462+
# The values in _v should be the
463+
# same on every line for each
464+
# transcript_id.
465+
file_idx[_k] = _v
466+
# Parse multi-value columns
467+
for multi_v_key in multi_value_columns:
468+
if multi_v_key not in file_idx[_k]:
469+
file_idx[_k][multi_v_key] = [] # init as empty list
470+
# Append the multi-value columns to the index
471+
file_idx[_k][multi_v_key].append(get_with_default(tokens,col_idx,multi_v_key))
472+
log("Finished indexing input file: {0} ({1} lines)".format(file, line_number))
473+
return file_idx
474+
475+
476+
def fasta(filename):
477+
"""
478+
Reads in a FASTA file and yields each of its entries.
479+
The generator yields each sequence identifier and its
480+
corresponding sequence to ensure a low memory profile.
481+
If a sequence occurs over multiple lines, the yielded
482+
sequence is concatenated.
483+
@param filename <str>:
484+
Path of FASTA file to read and parse
485+
@yield chrom, sequence <str>, <str>:
486+
Yields each seq id and seq in the FASTA file
487+
"""
488+
log("Started parsing FASTA file: {0}".format(filename))
489+
open_func = gzip.open if filename.endswith('.gz') else open
490+
with open_func(filename, 'rt') as file:
491+
sequence, chrom = '', ''
492+
for line in file:
493+
line = line.strip()
494+
if line.startswith('>') and sequence:
495+
# base case for additional entries
496+
yield chrom, sequence
497+
chrom = line[1:] # remove the > symbol
498+
sequence = ''
499+
elif line.startswith('>'):
500+
# base case for first entry in fasta file
501+
chrom = line[1:] # remove the > symbol
502+
else:
503+
# concatenate multi-line sequences
504+
sequence += line
505+
else:
506+
yield chrom, sequence
507+
log("Finished parsing FASTA file: {0}".format(filename))
508+
509+
305510
if __name__ == '__main__':
306511
# Parse command line arguments
307512
args = parse_cli_arguments()
308513

309-
# Sanity check for usage
310-
if len(sys.argv) == 1:
311-
# Nothing was provided
312-
fatal('Invalid usage: {0} [-h] ...'.format(os.path.basename(sys.argv[0])))
313-
314514
log("Running isoform sequences script with the following options: ", args)
315515
# Create output directory if
316516
# it does not exist
@@ -324,4 +524,88 @@ def index_header(file_header):
324524
)
325525
)
326526

327-
log("Finished running isoform sequences script!")
527+
# Parse the isoform switch results file
528+
# where keys are gene IDs and values
529+
# are lists of isoform IDs that have
530+
# significant isoform switches
531+
top_isa_switches = parsed_isa_switching_genes(
532+
args.isoform_switch_results,
533+
geneid_column="gene_id",
534+
isoformid_column="isoform_id",
535+
fdr_column="isoform_switch_q_value",
536+
fdr_threshold=args.fdr_filter
537+
)
538+
log("Found {0} genes with significant isoform switches.".format(len(top_isa_switches)))
539+
# Parse the splicing annotation file
540+
gene2isoform_dict = index_file(
541+
args.splicing_ann,
542+
key="gene_id",
543+
single_value_columns=["gene_name"],
544+
multi_value_columns=["transcript_id", "transcript_name"]
545+
)
546+
log('Indexed transcript information for {0} genes'.format(len(gene2isoform_dict)))
547+
# Create an index of transript_id
548+
# to gene_id mappings
549+
transcript2gene_dict = index_file(
550+
args.splicing_ann,
551+
key="transcript_id",
552+
single_value_columns=["gene_id"],
553+
multi_value_columns=[]
554+
)
555+
log('Indexed gene information for {0} transcripts'.format(len(transcript2gene_dict)))
556+
# Parse the transcripts FASTA file,
557+
# where keys are transcript IDs and values
558+
# are the corresponding concatnated sequences
559+
transcript_sequences = {}
560+
for chrom, sequence in fasta(args.transcripts_fasta):
561+
# Extract transcript ID from the FASTA header
562+
# using the delimiter specified by the user
563+
transcript_id = chrom.split(args.fasta_delim)[0]
564+
transcript_sequences[transcript_id] = sequence
565+
transcript_sequences[transcript_id] = sequence
566+
log("Parsed {0} transcripts from FASTA file.".format(len(transcript_sequences)))
567+
# Create a dictionary to group the
568+
# isoform switches by gene_id, ISA
569+
# has a tendency to report the gene_id
570+
# as the gene_name, meaning that gene_id
571+
# and gene_name are often the same.
572+
log("Started grouping isoform switches by gene_id.")
573+
top_isa_switches_grouped_by_gene = {}
574+
for gene_id, transcripts_list in top_isa_switches.items():
575+
# Group the isoform switches by gene_id
576+
for transcript_id in transcripts_list:
577+
_converted_gene_id = transcript2gene_dict.get(transcript_id, {}).get("gene_id", "NA")
578+
if _converted_gene_id == "NA":
579+
log("Warning: Transcript ID '{0}' does not have a corresponding gene_id, skipping...".format(transcript_id))
580+
continue
581+
if _converted_gene_id not in top_isa_switches_grouped_by_gene:
582+
top_isa_switches_grouped_by_gene[_converted_gene_id] = []
583+
top_isa_switches_grouped_by_gene[_converted_gene_id].append(transcript_id)
584+
log("Finished grouping isoform switches by {0} gene_id.".format(len(top_isa_switches_grouped_by_gene)))
585+
# Write the isoform sequences to the output file
586+
ofh_seq_number = 0
587+
with open(args.output, "w") as ofh:
588+
for gene_id in top_isa_switches_grouped_by_gene:
589+
# Get the the transcript_ids for
590+
# all the transcripts belonging to
591+
# the gene_id that has a significant
592+
# isoform switch
593+
isoform_list = gene2isoform_dict.get(gene_id, {}).get("transcript_id", [])
594+
gene_name = gene2isoform_dict.get(gene_id, {}).get("gene_name", "NA")
595+
# Write each isoform sequence to the output file
596+
for isoform_id in isoform_list:
597+
if isoform_id not in transcript_sequences:
598+
log("Warning: Transcript ID '{0}' not found in FASTA file, skipping.".format(isoform_id))
599+
continue
600+
sequence = transcript_sequences[isoform_id]
601+
# Output FASTA file format:
602+
# >transcript_id|gene_id|gene_name
603+
# ATGTTTACAGTACCCCAA
604+
ofh_seq_number += 1
605+
ofh.write(
606+
">{0}{1}{2}{1}{3}\n{4}\n".format(
607+
isoform_id, args.fasta_delim, gene_id, gene_name, sequence
608+
)
609+
)
610+
log("Finished writing {0} isoform sequences across to output file: {1}".format(ofh_seq_number, args.output))
611+
log("Finished running isoform sequences script!")

0 commit comments

Comments
 (0)