Skip to content

Commit 30cfc9d

Browse files
committed
Adding scaffolding for transcript sequences script
1 parent e71c066 commit 30cfc9d

1 file changed

Lines changed: 327 additions & 0 deletions

File tree

Lines changed: 327 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,327 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: UTF-8 -*-
3+
# Author: Skyler Kuhn
4+
5+
# Standard Library
6+
from __future__ import print_function
7+
from datetime import datetime
8+
from textwrap import dedent
9+
import argparse, gzip, os, sys
10+
11+
# Constants
12+
# Usage and help section
13+
_HELP = dedent("""
14+
@Usage:
15+
$ ./isoform_sequences.py [-h] [--version] \\
16+
[--fdr-filter FDR_FILTER] \\
17+
[--fasta-delim FASTA_DELIM] \\
18+
--isoform-switch-results ISOFORM_SWITCH_RESULTS \\
19+
--splicing-ann SPLICIN_ANN_FILE \\
20+
--transcripts-fasta TRANSCRIPTS_FASTA \\
21+
--output OUTPUT_FILE
22+
@About:
23+
Given an IsoformSwitchAnalyzeR results file,
24+
a splicing annptation file, and a FASTA file
25+
containing transcript sequences, this script
26+
will parse the splicing annotation file and
27+
the FASTA file to generate a new FASTA file
28+
containing the sequences of each transcript
29+
with a significant isoform switch.
30+
This script will not only output the sequence
31+
of the transcript containing the isoform switch,
32+
but also the sequence of all the isoforms that
33+
belong to the same gene as the transcript
34+
containing the isoform switch. This allows
35+
for the user to see the context of the isoform
36+
switch and how it relates to the other isoforms
37+
of the same gene.
38+
39+
@Required:
40+
-i, -isoform-switch-results ISOFORM_SWITCH_RESULTS
41+
Input isoform switch results file. This
42+
file should be a tab-separated file and
43+
contain the following columns:
44+
• isoform_id
45+
• gene_id
46+
• isoform_switch_q_value
47+
-s, --splicing-ann SPLICIN_ANN_FILE
48+
Splicing annotation file. This file should
49+
be a tab-separated file containing the
50+
following columns:
51+
• transcript_id
52+
• gene_id
53+
-t, --transcripts-fasta TRANSCRIPTS_FASTA
54+
FASTA file containing transcript sequences.
55+
This file should contain the sequences
56+
of all transcripts in the splicing
57+
annotation file. The sequences should
58+
be in the FASTA format, where each
59+
transcript is represented by a header
60+
line starting with ">" followed by the
61+
transcript ID, and the sequence on the
62+
next line. The header line should
63+
contain the transcript ID as the first
64+
field, followed by any additional
65+
information separated by the FASTA_DELIM
66+
character. The FASTA_DELIM character
67+
defaults to "|", but can be changed
68+
using the --fasta-delim option.
69+
-o, --output OUTPUT_FILE
70+
Output file with transcript containing
71+
isoform sequences. This file will be
72+
a FASTA file containing the sequences
73+
of the transcripts that contain an
74+
isoform switch. The sequences will be
75+
the sequences of the transcripts that
76+
contain the isoform switch, as well as
77+
the sequences of all the isoforms that
78+
belong to the same gene as the transcript
79+
containing the significant isoform switch.
80+
@Options:
81+
-f, --fdr-filter FDR_THRESHOLD
82+
FDR threshold to use for filtering isoform
83+
switches. This is the q-value threshold
84+
used to determine if an isoform switch is
85+
significant. Default: "0.1".
86+
-d, --fasta-delim FASTA_DELIM
87+
Delimiter character used in the FASTA
88+
file to separate the transcript ID from
89+
any additional information in the header.
90+
Default is "|". This character should not
91+
be present in the transcript ID. This char
92+
is used to parse the transcript ID from
93+
the FASTA header line. The transcript ID
94+
is the first field in the header line,
95+
followed by any additional information
96+
separated by the FASTA_DELIM character.
97+
Default: "|".
98+
-h, --help
99+
Shows help message and exits.
100+
-v, --version
101+
Prints the version and exits.
102+
103+
@Example:
104+
$ ./isoform_sequences.py \\
105+
-i isa_top_isoform_switches.tsv \\
106+
-s splicing_annotation.tsv \\
107+
-t transcripts.fasta \\
108+
-o isa_top_isoform_sequences.fa \\
109+
--fdr-filter 0.1 \\
110+
--fasta-delim "|"
111+
"""
112+
)
113+
114+
# Semantic version
115+
_VERISON = '1.0.0'
116+
117+
118+
# Helper functions
119+
def err(*message, **kwargs):
120+
"""Prints any provided args to standard error.
121+
kwargs can be provided to modify print functions
122+
behavior.
123+
@param message <any>:
124+
Values printed to standard error
125+
@params kwargs <print()>
126+
Key words to modify print function behavior
127+
"""
128+
print(*message, file=sys.stderr, **kwargs)
129+
130+
131+
def fatal(*message, **kwargs):
132+
"""Prints any provided args to standard error
133+
and exits with an exit code of 1.
134+
@param message <any>:
135+
Values printed to standard error
136+
@params kwargs <print()>
137+
Key words to modify print function behavior
138+
"""
139+
err(*message, **kwargs)
140+
sys.exit(1)
141+
142+
143+
def timestamp(format="%Y-%m-%d %H:%M:%S"):
144+
"""Returns a formatted timestamp string
145+
for the current time.
146+
@param format <str>:
147+
Format string for the timestamp, default:
148+
"%Y-%m-%d %H:%M:%S" which is equivalent to
149+
"2023-10-01 12:00:00" for example.
150+
@return <str>:
151+
Formatted timestamp string, i.e. "2023-10-01 12:00:00"
152+
"""
153+
return datetime.now().strftime(format)
154+
155+
156+
def log(*message):
157+
"""Logs a message to standard output with a timestamp.
158+
@param message <any>:
159+
Values printed to log
160+
"""
161+
print("[{0}] {1}".format(
162+
timestamp(),
163+
" ".join([str(m) for m in message]))
164+
)
165+
166+
167+
def check_permissions(parser, path, *args, **kwargs):
168+
"""Checks permissions using os.access() to see the
169+
user is authorized to access a file/directory. Checks
170+
for existence, read, write and execute via args:
171+
• os.F_OK (tests existence)
172+
• os.R_OK (tests read)
173+
• os.W_OK (tests write)
174+
• os.X_OK (tests exec)
175+
@param parser <argparse.ArgumentParser() object>:
176+
Argparse parser object
177+
@param path <str>:
178+
Name of path to check
179+
@param args <any>:
180+
Positional args to pass to os.access()
181+
@param kwargs <any>:
182+
Named kwargs to pass to os.access()
183+
@return path <str>:
184+
Returns absolute path if it exists and the
185+
checked permssions are setup are correct.
186+
"""
187+
if not os.path.exists(path):
188+
parser.error(
189+
"Path '{}' does not exists! Failed to provide vaild input.".format(path)
190+
)
191+
if not os.access(path, *args, **kwargs):
192+
parser.error(
193+
"Path '{}' exists, but cannot read path due to permissions!".format(path)
194+
)
195+
return os.path.abspath(path)
196+
197+
198+
def parse_cli_arguments():
199+
"""Parses command line arguments and returns
200+
an argparse.parse_args object.
201+
@return <argparse.parse_args()>:
202+
Parsed command line arguments
203+
"""
204+
parser = argparse.ArgumentParser(
205+
add_help=False,
206+
description=_HELP,
207+
formatter_class=argparse.RawDescriptionHelpFormatter,
208+
usage = argparse.SUPPRESS,
209+
)
210+
# Isoform switch results file
211+
parser.add_argument(
212+
'-i', '--isoform-switch-results',
213+
type = lambda file: \
214+
check_permissions(parser, file, os.R_OK),
215+
required=True,
216+
help=argparse.SUPPRESS
217+
)
218+
# Splicing annotation file
219+
parser.add_argument(
220+
'-s', '--splicing-ann',
221+
type = lambda file: \
222+
check_permissions(parser, file, os.R_OK),
223+
required=True,
224+
help=argparse.SUPPRESS
225+
)
226+
# Input transcripts sequences FASTA file
227+
parser.add_argument(
228+
'-t', '--transcripts-fasta',
229+
type = lambda file: \
230+
check_permissions(parser, file, os.R_OK),
231+
required=True,
232+
help=argparse.SUPPRESS
233+
)
234+
# Output FASTA file of isoform sequences
235+
parser.add_argument(
236+
'-o', '--output',
237+
type = str,
238+
required=True,
239+
help=argparse.SUPPRESS
240+
)
241+
# FDR filtering threshold
242+
parser.add_argument(
243+
'-f', '--fdr-filter',
244+
type=float,
245+
required=False,
246+
help=argparse.SUPPRESS,
247+
default=0.1
248+
)
249+
# Input transcripts FASTA delimiter,
250+
# used to parse the transcript ID from
251+
# the FASTA header line of the file
252+
# provided to --transcripts-fasta
253+
parser.add_argument(
254+
'-d', '--fasta-delim',
255+
type=str,
256+
required=False,
257+
help=argparse.SUPPRESS,
258+
default="|"
259+
)
260+
# Get version information
261+
parser.add_argument(
262+
'-v', '--version',
263+
action='version',
264+
help = argparse.SUPPRESS,
265+
version='%(prog)s {0}'.format(_VERISON)
266+
)
267+
# Add custom help message
268+
parser.add_argument(
269+
'-h', '--help',
270+
action='help',
271+
help=argparse.SUPPRESS
272+
)
273+
return parser.parse_args()
274+
275+
276+
def stripped(s):
277+
"""Cleans string to remove quotes
278+
@param s <str>:
279+
String to remove quotes or clean
280+
@return s <str>:
281+
Cleaned string with quotes removed
282+
"""
283+
return s.strip('"').strip("'")
284+
285+
286+
def index_header(file_header):
287+
"""Returns the index of each column_name
288+
as a dictionary.
289+
@param file_header <str>:
290+
First line of a file, containing column names
291+
@return idx <dict[str]=int>:
292+
Column name to index dictionary
293+
"""
294+
idx = {}
295+
tokens = [
296+
stripped(c.strip()) \
297+
for c in file_header.strip().split('\t')
298+
]
299+
# Create column name to index mapping
300+
for i,c in enumerate(tokens):
301+
idx[c]=i
302+
return idx
303+
304+
305+
if __name__ == '__main__':
306+
# Parse command line arguments
307+
args = parse_cli_arguments()
308+
309+
# Sanity check for usage
310+
if len(sys.argv) == 1:
311+
# Nothing was provided
312+
fatal('Invalid usage: {0} [-h] ...'.format(os.path.basename(sys.argv[0])))
313+
314+
log("Running isoform sequences script with the following options: ", args)
315+
# Create output directory if
316+
# it does not exist
317+
output_dir = os.path.abspath(os.path.dirname(args.output))
318+
if not os.path.exists(output_dir):
319+
try: os.makedirs(output_dir)
320+
except OSError as e:
321+
fatal(
322+
"Fatal error: Failed to create output directory: {0}\n{1}".format(
323+
output_dir, e
324+
)
325+
)
326+
327+
log("Finished running isoform sequences script!")

0 commit comments

Comments
 (0)