1+ #!/usr/bin/env python3
2+ # -*- coding: UTF-8 -*-
3+ # Author: Skyler Kuhn
4+
5+ # Standard Library
6+ from __future__ import print_function
7+ from datetime import datetime
8+ from textwrap import dedent
9+ import argparse , gzip , os , sys
10+
11+ # Constants
12+ # Usage and help section
13+ _HELP = dedent ("""
14+ @Usage:
15+ $ ./isoform_sequences.py [-h] [--version] \\
16+ [--fdr-filter FDR_FILTER] \\
17+ [--fasta-delim FASTA_DELIM] \\
18+ --isoform-switch-results ISOFORM_SWITCH_RESULTS \\
19+ --splicing-ann SPLICIN_ANN_FILE \\
20+ --transcripts-fasta TRANSCRIPTS_FASTA \\
21+ --output OUTPUT_FILE
22+ @About:
23+ Given an IsoformSwitchAnalyzeR results file,
24+ a splicing annptation file, and a FASTA file
25+ containing transcript sequences, this script
26+ will parse the splicing annotation file and
27+ the FASTA file to generate a new FASTA file
28+ containing the sequences of each transcript
29+ with a significant isoform switch.
30+ This script will not only output the sequence
31+ of the transcript containing the isoform switch,
32+ but also the sequence of all the isoforms that
33+ belong to the same gene as the transcript
34+ containing the isoform switch. This allows
35+ for the user to see the context of the isoform
36+ switch and how it relates to the other isoforms
37+ of the same gene.
38+
39+ @Required:
40+ -i, -isoform-switch-results ISOFORM_SWITCH_RESULTS
41+ Input isoform switch results file. This
42+ file should be a tab-separated file and
43+ contain the following columns:
44+ • isoform_id
45+ • gene_id
46+ • isoform_switch_q_value
47+ -s, --splicing-ann SPLICIN_ANN_FILE
48+ Splicing annotation file. This file should
49+ be a tab-separated file containing the
50+ following columns:
51+ • transcript_id
52+ • gene_id
53+ -t, --transcripts-fasta TRANSCRIPTS_FASTA
54+ FASTA file containing transcript sequences.
55+ This file should contain the sequences
56+ of all transcripts in the splicing
57+ annotation file. The sequences should
58+ be in the FASTA format, where each
59+ transcript is represented by a header
60+ line starting with ">" followed by the
61+ transcript ID, and the sequence on the
62+ next line. The header line should
63+ contain the transcript ID as the first
64+ field, followed by any additional
65+ information separated by the FASTA_DELIM
66+ character. The FASTA_DELIM character
67+ defaults to "|", but can be changed
68+ using the --fasta-delim option.
69+ -o, --output OUTPUT_FILE
70+ Output file with transcript containing
71+ isoform sequences. This file will be
72+ a FASTA file containing the sequences
73+ of the transcripts that contain an
74+ isoform switch. The sequences will be
75+ the sequences of the transcripts that
76+ contain the isoform switch, as well as
77+ the sequences of all the isoforms that
78+ belong to the same gene as the transcript
79+ containing the significant isoform switch.
80+ @Options:
81+ -f, --fdr-filter FDR_THRESHOLD
82+ FDR threshold to use for filtering isoform
83+ switches. This is the q-value threshold
84+ used to determine if an isoform switch is
85+ significant. Default: "0.1".
86+ -d, --fasta-delim FASTA_DELIM
87+ Delimiter character used in the FASTA
88+ file to separate the transcript ID from
89+ any additional information in the header.
90+ Default is "|". This character should not
91+ be present in the transcript ID. This char
92+ is used to parse the transcript ID from
93+ the FASTA header line. The transcript ID
94+ is the first field in the header line,
95+ followed by any additional information
96+ separated by the FASTA_DELIM character.
97+ Default: "|".
98+ -h, --help
99+ Shows help message and exits.
100+ -v, --version
101+ Prints the version and exits.
102+
103+ @Example:
104+ $ ./isoform_sequences.py \\
105+ -i isa_top_isoform_switches.tsv \\
106+ -s splicing_annotation.tsv \\
107+ -t transcripts.fasta \\
108+ -o isa_top_isoform_sequences.fa \\
109+ --fdr-filter 0.1 \\
110+ --fasta-delim "|"
111+ """
112+ )
113+
114+ # Semantic version
115+ _VERISON = '1.0.0'
116+
117+
118+ # Helper functions
119+ def err (* message , ** kwargs ):
120+ """Prints any provided args to standard error.
121+ kwargs can be provided to modify print functions
122+ behavior.
123+ @param message <any>:
124+ Values printed to standard error
125+ @params kwargs <print()>
126+ Key words to modify print function behavior
127+ """
128+ print (* message , file = sys .stderr , ** kwargs )
129+
130+
131+ def fatal (* message , ** kwargs ):
132+ """Prints any provided args to standard error
133+ and exits with an exit code of 1.
134+ @param message <any>:
135+ Values printed to standard error
136+ @params kwargs <print()>
137+ Key words to modify print function behavior
138+ """
139+ err (* message , ** kwargs )
140+ sys .exit (1 )
141+
142+
143+ def timestamp (format = "%Y-%m-%d %H:%M:%S" ):
144+ """Returns a formatted timestamp string
145+ for the current time.
146+ @param format <str>:
147+ Format string for the timestamp, default:
148+ "%Y-%m-%d %H:%M:%S" which is equivalent to
149+ "2023-10-01 12:00:00" for example.
150+ @return <str>:
151+ Formatted timestamp string, i.e. "2023-10-01 12:00:00"
152+ """
153+ return datetime .now ().strftime (format )
154+
155+
156+ def log (* message ):
157+ """Logs a message to standard output with a timestamp.
158+ @param message <any>:
159+ Values printed to log
160+ """
161+ print ("[{0}] {1}" .format (
162+ timestamp (),
163+ " " .join ([str (m ) for m in message ]))
164+ )
165+
166+
167+ def check_permissions (parser , path , * args , ** kwargs ):
168+ """Checks permissions using os.access() to see the
169+ user is authorized to access a file/directory. Checks
170+ for existence, read, write and execute via args:
171+ • os.F_OK (tests existence)
172+ • os.R_OK (tests read)
173+ • os.W_OK (tests write)
174+ • os.X_OK (tests exec)
175+ @param parser <argparse.ArgumentParser() object>:
176+ Argparse parser object
177+ @param path <str>:
178+ Name of path to check
179+ @param args <any>:
180+ Positional args to pass to os.access()
181+ @param kwargs <any>:
182+ Named kwargs to pass to os.access()
183+ @return path <str>:
184+ Returns absolute path if it exists and the
185+ checked permssions are setup are correct.
186+ """
187+ if not os .path .exists (path ):
188+ parser .error (
189+ "Path '{}' does not exists! Failed to provide vaild input." .format (path )
190+ )
191+ if not os .access (path , * args , ** kwargs ):
192+ parser .error (
193+ "Path '{}' exists, but cannot read path due to permissions!" .format (path )
194+ )
195+ return os .path .abspath (path )
196+
197+
198+ def parse_cli_arguments ():
199+ """Parses command line arguments and returns
200+ an argparse.parse_args object.
201+ @return <argparse.parse_args()>:
202+ Parsed command line arguments
203+ """
204+ parser = argparse .ArgumentParser (
205+ add_help = False ,
206+ description = _HELP ,
207+ formatter_class = argparse .RawDescriptionHelpFormatter ,
208+ usage = argparse .SUPPRESS ,
209+ )
210+ # Isoform switch results file
211+ parser .add_argument (
212+ '-i' , '--isoform-switch-results' ,
213+ type = lambda file : \
214+ check_permissions (parser , file , os .R_OK ),
215+ required = True ,
216+ help = argparse .SUPPRESS
217+ )
218+ # Splicing annotation file
219+ parser .add_argument (
220+ '-s' , '--splicing-ann' ,
221+ type = lambda file : \
222+ check_permissions (parser , file , os .R_OK ),
223+ required = True ,
224+ help = argparse .SUPPRESS
225+ )
226+ # Input transcripts sequences FASTA file
227+ parser .add_argument (
228+ '-t' , '--transcripts-fasta' ,
229+ type = lambda file : \
230+ check_permissions (parser , file , os .R_OK ),
231+ required = True ,
232+ help = argparse .SUPPRESS
233+ )
234+ # Output FASTA file of isoform sequences
235+ parser .add_argument (
236+ '-o' , '--output' ,
237+ type = str ,
238+ required = True ,
239+ help = argparse .SUPPRESS
240+ )
241+ # FDR filtering threshold
242+ parser .add_argument (
243+ '-f' , '--fdr-filter' ,
244+ type = float ,
245+ required = False ,
246+ help = argparse .SUPPRESS ,
247+ default = 0.1
248+ )
249+ # Input transcripts FASTA delimiter,
250+ # used to parse the transcript ID from
251+ # the FASTA header line of the file
252+ # provided to --transcripts-fasta
253+ parser .add_argument (
254+ '-d' , '--fasta-delim' ,
255+ type = str ,
256+ required = False ,
257+ help = argparse .SUPPRESS ,
258+ default = "|"
259+ )
260+ # Get version information
261+ parser .add_argument (
262+ '-v' , '--version' ,
263+ action = 'version' ,
264+ help = argparse .SUPPRESS ,
265+ version = '%(prog)s {0}' .format (_VERISON )
266+ )
267+ # Add custom help message
268+ parser .add_argument (
269+ '-h' , '--help' ,
270+ action = 'help' ,
271+ help = argparse .SUPPRESS
272+ )
273+ return parser .parse_args ()
274+
275+
276+ def stripped (s ):
277+ """Cleans string to remove quotes
278+ @param s <str>:
279+ String to remove quotes or clean
280+ @return s <str>:
281+ Cleaned string with quotes removed
282+ """
283+ return s .strip ('"' ).strip ("'" )
284+
285+
286+ def index_header (file_header ):
287+ """Returns the index of each column_name
288+ as a dictionary.
289+ @param file_header <str>:
290+ First line of a file, containing column names
291+ @return idx <dict[str]=int>:
292+ Column name to index dictionary
293+ """
294+ idx = {}
295+ tokens = [
296+ stripped (c .strip ()) \
297+ for c in file_header .strip ().split ('\t ' )
298+ ]
299+ # Create column name to index mapping
300+ for i ,c in enumerate (tokens ):
301+ idx [c ]= i
302+ return idx
303+
304+
305+ if __name__ == '__main__' :
306+ # Parse command line arguments
307+ args = parse_cli_arguments ()
308+
309+ # Sanity check for usage
310+ if len (sys .argv ) == 1 :
311+ # Nothing was provided
312+ fatal ('Invalid usage: {0} [-h] ...' .format (os .path .basename (sys .argv [0 ])))
313+
314+ log ("Running isoform sequences script with the following options: " , args )
315+ # Create output directory if
316+ # it does not exist
317+ output_dir = os .path .abspath (os .path .dirname (args .output ))
318+ if not os .path .exists (output_dir ):
319+ try : os .makedirs (output_dir )
320+ except OSError as e :
321+ fatal (
322+ "Fatal error: Failed to create output directory: {0}\n {1}" .format (
323+ output_dir , e
324+ )
325+ )
326+
327+ log ("Finished running isoform sequences script!" )
0 commit comments