@@ -291,6 +291,54 @@ def index_file(file, keys, key_delim, values):
291291 return file_idx
292292
293293
def parse_intron(line_list, intron_idx):
    """Parses intron information into tokens from a line list
    in the effect sizes file.
    @param line_list <list[str]>:
        List of tokens from a line in the effect sizes file,
        where each token represents a column in the file.
    @param intron_idx <int>:
        Index of the intron column in the line_list.
        The intron column is formatted as:
        "{chr}:{intron_start}:{intron_end}:{clust_id}"
    @return <tuple>:
        Returns a tuple of the form:
            (chromosome, intron_start, intron_end, cluster_id)
        where each element is a string.
        If the intron column is missing from the line (i.e. the
        line has too few columns) or the intron format is invalid,
        a warning is reported via err() and None is returned.
    """
    try:
        intron = line_list[intron_idx]
    except IndexError:
        # A blank or truncated line has fewer columns than expected;
        # report it and let the caller skip the line instead of
        # aborting the whole run with an uncaught exception.
        err("Warning: Missing intron column in provided effect sizes file!")
        err(" └── Expected intron column index = {0}".format(intron_idx))
        err(" └── Encountered number of columns = {0}".format(len(line_list)))
        return None
    try:
        # Unpacking raises ValueError unless there are exactly 4 fields
        chrom, start, stop, cluster_id = intron.split(":")
    except ValueError:
        err("Warning: Invalid intron format in provided effect sizes file!")
        err(" └── Expected format = chr:intron_start:intron_end:clust_id")
        err(" └── Encountered format = {0}".format(intron))
        return None
    return (chrom, start, stop, cluster_id)
319+
320+
def get_additional_annotation_information(annotation_dict, first_key, values):
    """Get additional annotation information from a nested
    dictionary using the first_key and each value in values
    as a composite key. Returns a list of values corresponding
    to the provided (first_key, value) pairs in annotation_dict.
    If a key is not found, it returns "NA" for that value.
    @param annotation_dict <dict>:
        Nested dictionary containing additional annotation information
        keyed by [first_key][v] where v is an element in values.
    @param first_key <str>:
        First key in the nested dictionary to use for lookups.
    @param values <list[str]>:
        List of values to retrieve from the dictionary. This is
        the second key in the nested dictionary.
    @return <list[str]>:
        Returns a list with one entry per element of values, in
        the same order. If first_key or a given value is not
        found, "NA" is returned for that entry.
    """
    # Resolve the outer key once rather than once per requested value;
    # a missing first_key falls back to an empty mapping so every
    # inner lookup below yields "NA".
    inner = annotation_dict.get(first_key, {})
    return [inner.get(v, "NA") for v in values]
340+
341+
294342if __name__ == '__main__' :
295343 # Parse command line arguments
296344 args = parse_cli_arguments ()
@@ -348,29 +396,35 @@ def index_file(file, keys, key_delim, values):
348396 for line in ifh :
349397 # Split the line into columns
350398 tokens = line .rstrip ().split ('\t ' )
351- # where intron column format:
399+ # Parse intron info, where intron column format:
352400 # {chr}:{intron_start}:{intron_end}:{clust_id}
353- intron = tokens [intron_idx ]
354- ichrom , istart , istop , icluster_id = intron .split (":" )
355- # where cluster_signif look
356- # up key = {chr}:{clust_id}
357- _cluster_signif_values = []
358- for v in PARSE_CLUSTER_SIGNIF :
359- clust_signif_key = "{0}:{1}" .format (ichrom , icluster_id )
360- try : parsed_clust_v = cluster_signif_dict [clust_signif_key ][v ]
361- except KeyError : parsed_clust_v = "NA"
362- _cluster_signif_values .append (parsed_clust_v )
401+ intron = parse_intron (tokens , intron_idx ) # returns (chr,intron_start,intron_end,clust_id)
402+ if intron is None : continue # invalid intron format
403+ intron_chrom , intron_start , intron_stop , intron_cluster_id = intron
404+ # Get additional cluster signif info,
405+ # where cluster_signif look up
406+ # first_key = {chr}:{clust_id}
407+ # and second_key is each element in
408+ # PARSE_CLUSTER_SIGNIF
409+ _cluster_signif_values = get_additional_annotation_information (
410+ cluster_signif_dict ,
411+ "{0}:{1}" .format (intron_chrom , intron_cluster_id ),
412+ PARSE_CLUSTER_SIGNIF
413+ )
363414 # Check if cluster meets FDR threshold
364415 try : fdr = float (_cluster_signif_values [ADJ_P_COLUMN_IDX ])
365416 except ValueError : continue # value cannot be type cast, i.e NA
366417 if fdr > float (args .fdr_filter ): continue # does not meet filter
367- # where intron_ann look
368- # up key = {chr}:{intron_start}:{intron_end}:{clust_id}
369- _intron_ann_values = []
370- for v in PARSE_INTRON_ANN :
371- try : parsed_intron_v = intron_ann_dict [intron ][v ]
372- except KeyError : parsed_intron_v = "NA"
373- _intron_ann_values .append (parsed_intron_v )
418+ # Get additional intron info, where
419+ # intron_ann first_key:
420+ # {chr}:{intron_start}:{intron_end}:{clust_id}
421+ # and second_key is each element in
422+ # PARSE_INTRON_ANN
423+ _intron_ann_values = get_additional_annotation_information (
424+ intron_ann_dict ,
425+ ":" .join (intron ),
426+ PARSE_INTRON_ANN
427+ )
374428 # Write annotated line to output
375429 _output_line = "{0}\t {1}\t {2}" .format (
376430 "\t " .join (tokens ),
0 commit comments