1+ #!/usr/bin/env python3
2+ # -*- coding: UTF-8 -*-
3+ # Author: Skyler Kuhn
4+
5+ # Standard Library
6+ from __future__ import print_function
7+ from textwrap import dedent
8+ import argparse , gzip , os , sys
9+
10+ # Constants
11+ # Usage and help section
# Usage and help section, rendered verbatim by argparse for -h/--help
# (description uses RawDescriptionHelpFormatter below, so layout here
# is preserved; dedent() strips common leading indentation).
_HELP = dedent("""
@Usage:
    $ ./leafcutter_annotation.py [-h] [--version] \\
        [--fdr-filter FDR_FILTER] \\
        --effect-sizes EFFECT_SIZES_FILE \\
        --cluster-signif CLUSTER_SIGNIF_FILE \\
        --intron-ann INTRON_ANN_FILE \\
        --exon-ann EXON_ANN_FILE \\
        --output OUTPUT_FILE
@About:
    Given the differential splicing results from
    leafcutter (effect size, cluster signif output
    files), an intron annotation file, and an exon
    annotation file, this script will collate info
    from each source to add the cluster adjusted
    p-values, transcript information and exon
    information to the intron effect sizes file.
    This allow a user to quickly filter the output
    from leafcutter and to see what transcripts and
    exons are associated with any characterized
    differential splicing events.

    The results can also be filtered based on an
    adjusted p-value threshold, where the default
    is set to "0.1".

@Required:
    -s, --effect-sizes EFFECT_SIZES_FILE
        Input leafcutter effect sizes file.
        This file is generated by running
        "leafcutter_ds.R".
    -c, --cluster-signif CLUSTER_SIGNIF_FILE
        Input leafcutter cluster significance
        file. This file is generated by
        running "leafcutter_ds.R".
    -i, --intron-ann INTRON_ANN_FILE
        Input intron annotation file. This
        file was generated by exporting the
        "intron" table from the Rdata file
        generated by "prepare_results.R".
    -e, --exon-ann EXON_ANN_FILE
        Input exon annotation file. This file
        was generated by parsing exon info
        from the GTF file. It is the output
        file of "exon_annotation.py".
    -o, --output OUTPUT_FILE
        Output file with merged and annotated
        leafcutter results.
@Options:
    -f, --fdr-filter FDR_FILTER
        Adjusted p-value filter. This option
        will filter the results to only focus
        on differential splicing events with
        a cluster significance less than or
        equal to this value, default: "0.1".
    -h, --help
        Shows help message and exits.
    -v, --version
        Prints the version and exits.

@Example:
    $ ./leafcutter_annotation.py \\
        -s leafcutter_effect_sizes.txt \\
        -c leafcutter_cluster_significance.txt \\
        -i intron_annotation.tsv \\
        -e exon_annotation.tsv \\
        -o leafcutter_annotated_results.tsv \\
        -f 0.1
"""
)
82+
# Semantic version
# NOTE(review): "_VERISON" is a typo of "_VERSION"; kept as-is because
# parse_cli_arguments() below references the constant by this name.
_VERISON = '1.0.0'
85+
86+
87+ # Helper functions
def err(*message, **kwargs):
    """Print the provided values to standard error.

    Any extra keyword arguments are forwarded to print() to
    modify its behavior (e.g. end='', sep='').
    @param message <any>:
        Values printed to standard error
    @params kwargs <print()>
        Key words to modify print function behavior
    """
    # Route everything to stderr so stdout stays clean for piping
    print(*message, file=sys.stderr, **kwargs)
98+
99+
def fatal(*message, **kwargs):
    """Print the provided values to standard error, then
    terminate the program with an exit code of 1.
    @param message <any>:
        Values printed to standard error
    @params kwargs <print()>
        Key words to modify print function behavior
    """
    err(*message, **kwargs)
    # Non-zero exit code signals failure to the calling shell
    sys.exit(1)
110+
111+
def check_permissions(parser, path, *args, **kwargs):
    """Checks permissions using os.access() to see if the
    user is authorized to access a file/directory. Checks
    for existence, read, write and execute via args:
        - os.F_OK (tests existence)
        - os.R_OK (tests read)
        - os.W_OK (tests write)
        - os.X_OK (tests exec)
    @param parser <argparse.ArgumentParser() object>:
        Argparse parser object, used to report validation errors
    @param path <str>:
        Name of path to check
    @param args <any>:
        Positional args to pass to os.access()
    @param kwargs <any>:
        Named kwargs to pass to os.access()
    @return path <str>:
        Returns absolute path if it exists and the
        checked permissions are setup correctly.
    """
    if not os.path.exists(path):
        # parser.error() prints the message and exits with status 2
        parser.error(
            "Path '{}' does not exist! Failed to provide valid input.".format(path)
        )
    if not os.access(path, *args, **kwargs):
        parser.error(
            "Path '{}' exists, but cannot read path due to permissions!".format(path)
        )
    return os.path.abspath(path)
141+
142+
def parse_cli_arguments():
    """Parses command line arguments and returns
    an argparse.parse_args object.
    @return <argparse.parse_args()>:
        Parsed command line arguments
    """
    parser = argparse.ArgumentParser(
        add_help=False,
        description=_HELP,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        usage=argparse.SUPPRESS,
    )
    # Validator for input files: checks existence/readability
    # and resolves to an absolute path
    readable = lambda p: check_permissions(parser, p, os.R_OK)
    # Required input files: leafcutter effect sizes, leafcutter
    # cluster significance, intron annotation, exon annotation
    for flags in (
        ('-s', '--effect-sizes'),
        ('-c', '--cluster-signif'),
        ('-i', '--intron-ann'),
        ('-e', '--exon-ann'),
    ):
        parser.add_argument(
            *flags,
            type=readable,
            required=True,
            help=argparse.SUPPRESS
        )
    # Annotated output results
    parser.add_argument(
        '-o', '--output',
        type=str,
        required=True,
        help=argparse.SUPPRESS
    )
    # FDR filtering threshold
    parser.add_argument(
        '-f', '--fdr-filter',
        type=float,
        required=False,
        default=0.1,
        help=argparse.SUPPRESS
    )
    # Get version information
    parser.add_argument(
        '-v', '--version',
        action='version',
        version='%(prog)s {0}'.format(_VERISON),
        help=argparse.SUPPRESS
    )
    # Add custom help message
    parser.add_argument(
        '-h', '--help',
        action='help',
        help=argparse.SUPPRESS
    )
    return parser.parse_args()
215+
216+
def stripped(s):
    """Cleans a string by removing surrounding quote characters.
    Double quotes are removed first, then single quotes.
    @param s <str>:
        String to remove quotes or clean
    @return s <str>:
        Cleaned string with quotes removed
    """
    for quote in ('"', "'"):
        s = s.strip(quote)
    return s
225+
226+
def index_header(file_header):
    """Returns the index of each column_name
    as a dictionary.
    @param file_header <str>:
        First line of a file, containing column names
    @return idx <dict[str]=int>:
        Column name to index dictionary
    """
    columns = file_header.strip().split('\t')
    # Trim surrounding whitespace and any quote characters from
    # each column name, then map it to its positional index
    return {
        name.strip().strip('"').strip("'"): position
        for position, name in enumerate(columns)
    }
244+
245+
def index_file(file, keys, key_delim, values):
    """Parses and indexes a file into a dictionary for quick
    lookups later. The file will be indexed on the column names
    of the keys provided and the values will be stored as a
    nested dictionary. If multiple keys are provided, they
    are concatenated into a single string where key_delim
    is the delimiter. Handles both plain-text and gzip
    compressed (".gz") input files.
    @param file <str>:
        File to parse and index. Must contain a header with
        the columns listed in keys and values. The index of
        these columns will be automatically resolved.
    @param keys <list[str]>:
        List of column names to index the file on. If more
        than one key is provided, then an index will be created
        by concatenating the keys into a single string where
        key_delim is the delimiter.
    @param key_delim <str>:
        Delimiter used to join multiple key columns into a
        single lookup key.
    @param values <list[str]>:
        List of column names to associate with each key. These
        values will be stored as a nested dictionary so they can
        be pulled by their name.
    @return file_idx <dict[str]=dict[str]=str>:
        Nested dictionary where,
            - key = 'key_delim'.join(keys)
            - value = {val_col1: "A", val_col2: "B"}
        Given,
            keys=["A","B"], values=["C","D"], key_delim="|"
        returns {"A|B": {"C": "c_i", "D": "d_i"}}
    """
    file_idx = {}
    # Handler for opening files, i.e.
    # uncompressed or gzip files
    open_func = gzip.open if file.endswith('.gz') else open
    with open_func(file, 'rt') as fh:
        # Resolve column name -> column index from the header line
        col_idx = index_header(next(fh))
        for line in fh:
            # Split the line into columns
            tokens = line.strip().split('\t')
            # Concatenate multiple keys into a single key
            # separated by the key_delim character
            _k = key_delim.join([tokens[col_idx[k]] for k in keys])
            file_idx[_k] = {v: tokens[col_idx[v]] for v in values}
    return file_idx
292+
293+
if __name__ == '__main__':
    # Sanity check for usage: must run BEFORE parse_cli_arguments(),
    # otherwise argparse exits on the missing required options first
    # and this friendlier hint is never reached (it was dead code).
    if len(sys.argv) == 1:
        # Nothing was provided
        fatal('Invalid usage: {0} [-h] ...'.format(os.path.basename(sys.argv[0])))

    # Parse command line arguments
    args = parse_cli_arguments()

    # Create output directory if it does not exist;
    # exist_ok avoids a race between the check and the mkdir
    output_dir = os.path.abspath(os.path.dirname(args.output))
    try:
        os.makedirs(output_dir, exist_ok=True)
    except OSError as e:
        fatal(
            "Fatal error: Failed to create output directory: {0}\n{1}".format(
                output_dir, e
            )
        )

    # Parse cluster_id and adjusted p-values from the leafcutter
    # cluster significance file. The "cluster" column already holds
    # the composite "{chr}:{clust_id}" value, so key_delim is empty.
    ADJ_P_COLUMN_NAME = "p.adjust"
    PARSE_CLUSTER_SIGNIF = ["df", ADJ_P_COLUMN_NAME]
    ADJ_P_COLUMN_IDX = PARSE_CLUSTER_SIGNIF.index(ADJ_P_COLUMN_NAME)
    cluster_signif_dict = index_file(
        args.cluster_signif,
        keys=["cluster"],
        values=PARSE_CLUSTER_SIGNIF,
        key_delim=""
    )

    # Parse gene, ensembl_id, verdict, and transcripts from the
    # leafviz intron annotation file where:
    #   key = {chr}:{intron_start}:{intron_end}:{clust_id}
    PARSE_INTRON_ANN = ["gene", "ensemblID", "verdict", "transcripts"]
    intron_ann_dict = index_file(
        args.intron_ann,
        keys=["chr", "start", "end", "clusterID"],
        values=PARSE_INTRON_ANN,
        key_delim=":"
    )

    # Loop through the effect sizes file, appending the cluster
    # significance and intron annotation columns to every intron
    # that passes the FDR filter. `with` guarantees both handles
    # are closed (the output handle was previously never closed).
    with open(args.effect_sizes, "r") as ifh, open(args.output, "w") as ofh:
        input_header = next(ifh).rstrip().split("\t") + PARSE_CLUSTER_SIGNIF + PARSE_INTRON_ANN
        intron_idx = input_header.index("intron")
        ofh.write("\t".join(input_header) + "\n")
        for line in ifh:
            # Split the line into columns
            tokens = line.rstrip().split('\t')
            # where intron column format:
            #   {chr}:{intron_start}:{intron_end}:{clust_id}
            intron = tokens[intron_idx]
            ichrom, istart, istop, icluster_id = intron.split(":")
            # Cluster significance lookup key = {chr}:{clust_id};
            # hoisted out of the loop below (it is loop-invariant)
            clust_signif_key = "{0}:{1}".format(ichrom, icluster_id)
            _cluster_signif_values = []
            for v in PARSE_CLUSTER_SIGNIF:
                try:
                    parsed_clust_v = cluster_signif_dict[clust_signif_key][v]
                except KeyError:
                    parsed_clust_v = "NA"
                _cluster_signif_values.append(parsed_clust_v)
            # Check if cluster meets FDR threshold
            try:
                fdr = float(_cluster_signif_values[ADJ_P_COLUMN_IDX])
            except ValueError:
                continue  # value cannot be type cast, i.e. NA
            if fdr > float(args.fdr_filter):
                continue  # does not meet filter
            # Intron annotation lookup key is the intron column itself:
            #   {chr}:{intron_start}:{intron_end}:{clust_id}
            _intron_ann_values = []
            for v in PARSE_INTRON_ANN:
                try:
                    parsed_intron_v = intron_ann_dict[intron][v]
                except KeyError:
                    parsed_intron_v = "NA"
                _intron_ann_values.append(parsed_intron_v)
            # Write annotated line to output
            _output_line = "{0}\t{1}\t{2}".format(
                "\t".join(tokens),
                "\t".join(_cluster_signif_values),
                "\t".join(_intron_ann_values)
            )
            ofh.write(_output_line + "\n")