Skip to content

Commit 299a4e4

Browse files
committed
Refactoring annotation lookups to reduce redundancy
1 parent 6f1a412 commit 299a4e4

1 file changed

Lines changed: 72 additions & 18 deletions

File tree

workflow/scripts/leafcutter_annotation.py

Lines changed: 72 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -291,6 +291,54 @@ def index_file(file, keys, key_delim, values):
291291
return file_idx
292292

293293

294+
def parse_intron(line_list, intron_idx):
    """Parses intron information into tokens from a line list
    in the effect sizes file.
    @param line_list <list[str]>:
        List of tokens from a line in the effect sizes file,
        where each token represents a column in the file.
    @param intron_idx <int>:
        Index of the intron column in the line_list.
        The intron column is formatted as:
            "{chr}:{intron_start}:{intron_end}:{clust_id}"
    @return <tuple|None>:
        Returns a tuple of the form:
            (chromosome, intron_start, intron_end, cluster_id)
        where each element is a string.
        If the intron column is missing from the line (i.e. the
        line has too few columns) or the intron format is invalid,
        a warning is reported through err() and it returns None.
    """
    # A blank or truncated line has no intron column; treat it the
    # same way as a malformed intron (warn + None) instead of letting
    # an IndexError propagate to the caller.
    try:
        intron = line_list[intron_idx]
    except IndexError:
        err("Warning: Missing intron column in provided effect sizes file!")
        err(" └── Intron column index = {0}".format(intron_idx))
        err(" └── Columns found in line = {0}".format(len(line_list)))
        return None
    # Unpacking raises ValueError when the field does not split into
    # exactly four colon-delimited tokens (too few or too many).
    try:
        chrom, start, stop, cluster_id = intron.split(":")
    except ValueError:
        err("Warning: Invalid intron format in provided effect sizes file!")
        err(" └── Expected format = chr:intron_start:intron_end:clust_id")
        err(" └── Encountered format = {0}".format(intron))
        return None
    return (chrom, start, stop, cluster_id)
319+
320+
321+
def get_additional_annotation_information(annotation_dict, first_key, values):
    """Get additional annotation information from a nested
    dictionary using the first_key and each value in values
    as a composite key. Returns a list of values corresponding
    to the provided (first_key, value) pairs in annotation_dict.
    If a key is not found, it returns "NA" for that value.
    @param annotation_dict <dict>:
        Nested dictionary containing additional annotation information
        keyed by [first_key][v] where v is an element in values.
    @param first_key <str>:
        First key in the nested dictionary to use for lookups.
    @param values <list[str]>:
        List of values to retrieve from the dictionary. This is
        the second key in the nested dictionary.
    @return <list[str]>:
        Returns a list of values corresponding to the provided keys,
        in the same order as values. If first_key or a given value
        is not found, it returns "NA" for that value.
    """
    # Resolve the outer key once rather than once per value; a missing
    # first_key yields an empty mapping so every lookup falls back to "NA".
    record = annotation_dict.get(first_key, {})
    return [record.get(v, "NA") for v in values]
340+
341+
294342
if __name__ == '__main__':
295343
# Parse command line arguments
296344
args = parse_cli_arguments()
@@ -348,29 +396,35 @@ def index_file(file, keys, key_delim, values):
348396
for line in ifh:
349397
# Split the line into columns
350398
tokens = line.rstrip().split('\t')
351-
# where intron column format:
399+
# Parse intron info, where intron column format:
352400
# {chr}:{intron_start}:{intron_end}:{clust_id}
353-
intron = tokens[intron_idx]
354-
ichrom, istart, istop, icluster_id = intron.split(":")
355-
# where cluster_signif look
356-
# up key = {chr}:{clust_id}
357-
_cluster_signif_values = []
358-
for v in PARSE_CLUSTER_SIGNIF:
359-
clust_signif_key = "{0}:{1}".format(ichrom, icluster_id)
360-
try: parsed_clust_v = cluster_signif_dict[clust_signif_key][v]
361-
except KeyError: parsed_clust_v = "NA"
362-
_cluster_signif_values.append(parsed_clust_v)
401+
intron = parse_intron(tokens, intron_idx) # returns (chr,intron_start,intron_end,clust_id)
402+
if intron is None: continue # invalid intron format
403+
intron_chrom, intron_start, intron_stop, intron_cluster_id = intron
404+
# Get additional cluster signif info,
405+
# where cluster_signif look up
406+
# first_key = {chr}:{clust_id}
407+
# and second_key is each element in
408+
# PARSE_CLUSTER_SIGNIF
409+
_cluster_signif_values = get_additional_annotation_information(
410+
cluster_signif_dict,
411+
"{0}:{1}".format(intron_chrom, intron_cluster_id),
412+
PARSE_CLUSTER_SIGNIF
413+
)
363414
# Check if cluster meets FDR threshold
364415
try: fdr = float(_cluster_signif_values[ADJ_P_COLUMN_IDX])
365416
except ValueError: continue # value cannot be type cast, i.e NA
366417
if fdr > float(args.fdr_filter): continue # does not meet filter
367-
# where intron_ann look
368-
# up key = {chr}:{intron_start}:{intron_end}:{clust_id}
369-
_intron_ann_values = []
370-
for v in PARSE_INTRON_ANN:
371-
try: parsed_intron_v = intron_ann_dict[intron][v]
372-
except KeyError: parsed_intron_v = "NA"
373-
_intron_ann_values.append(parsed_intron_v)
418+
# Get additional intron info, where
419+
# intron_ann first_key:
420+
# {chr}:{intron_start}:{intron_end}:{clust_id}
421+
# and second_key is each element in
422+
# PARSE_INTRON_ANN
423+
_intron_ann_values = get_additional_annotation_information(
424+
intron_ann_dict,
425+
":".join(intron),
426+
PARSE_INTRON_ANN
427+
)
374428
# Write annotated line to output
375429
_output_line = "{0}\t{1}\t{2}".format(
376430
"\t".join(tokens),

0 commit comments

Comments
 (0)