Skip to content

Commit 6f1a412

Browse files
committed
Adding script to add extra annotation information to leafcutter results.
1 parent fddea05 commit 6f1a412

1 file changed

Lines changed: 380 additions & 0 deletions

File tree

Lines changed: 380 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,380 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: UTF-8 -*-
3+
# Author: Skyler Kuhn
4+
5+
# Standard Library
6+
from __future__ import print_function
7+
from textwrap import dedent
8+
import argparse, gzip, os, sys
9+
10+
# Constants
11+
# Usage and help section
12+
_HELP = dedent("""
13+
@Usage:
14+
$ ./leafcutter_annotation.py [-h] [--version] \\
15+
[--fdr-filter FDR_FILTER] \\
16+
--effect-sizes EFFECT_SIZES_FILE \\
17+
--cluster-signif CLUSTER_SIGNIF_FILE \\
18+
--intron-ann INTRON_ANN_FILE \\
19+
--exon-ann EXON_ANN_FILE \\
20+
--output OUTPUT_FILE
21+
@About:
22+
Given the differential splicing results from
23+
leafcutter (effect size, cluster signif output
24+
files), an intron annotation file, and an exon
25+
annotation file, this script will collate info
26+
from each source to add the cluster adjusted
27+
p-values, transcript information and exon
28+
information to the intron effect sizes file.
29+
This allow a user to quickly filter the output
30+
from leafcutter and to see what transcripts and
31+
exons are associated with any characterized
32+
differential splicing events.
33+
34+
The results can also be filtered based on an
35+
adjusted p-value threshold, where the default
36+
is set to "0.1".
37+
38+
@Required:
39+
-s, --effect-sizes EFFECT_SIZES_FILE
40+
Input leafcutter effect sizes file.
41+
This file is generated by running
42+
"leafcutter_ds.R".
43+
-c, --cluster-signif CLUSTER_SIGNIF_FILE
44+
Input leafcutter cluster significance
45+
file. This file is generated by
46+
running "leafcutter_ds.R".
47+
-i, --intron-ann INTRON_ANN_FILE
48+
Input intron annotation file. This
49+
file was generated by exporting the
50+
"intron" table from the Rdata file
51+
generated by "prepare_results.R".
52+
-e, --exon-ann EXON_ANN_FILE
53+
Input exon annotation file. This file
54+
was generated by parsing exon info
55+
from the GTF file. It is the output
56+
file of "exon_annotation.py".
57+
-o, --output OUTPUT_FILE
58+
Output file with merged and annotated
59+
leafcutter results.
60+
@Options:
61+
-f, --fdr-filter FDR_FILTER
62+
Adjusted p-value filter. This option
63+
will filter the results to only focus
64+
on differential splicing events with
65+
a cluster significance less than or
66+
equal to this value, default: "0.1".
67+
-h, --help
68+
Shows help message and exits.
69+
-v, --version
70+
Prints the version and exits.
71+
72+
@Example:
73+
$ ./leafcutter_annotation.py \\
74+
-s leafcutter_effect_sizes.txt \\
75+
-c leafcutter_cluster_significance.txt \\
76+
-i intron_annotation.tsv \\
77+
-e exon_annotation.tsv \\
78+
-o leafcutter_annotated_results.tsv \\
79+
-f 0.1
80+
"""
81+
)
82+
83+
# Semantic version
84+
_VERISON = '1.0.0'
85+
86+
87+
# Helper functions
def err(*message, **kwargs):
    """Writes any provided positional values to standard error.

    Thin wrapper around the builtin print() that forces output onto
    sys.stderr; all other print() keyword arguments pass through.
    @param message <any>:
        Values printed to standard error
    @params kwargs <print()>
        Key words to modify print function behavior
    """
    # file= is pinned to stderr; passing file in kwargs is a caller
    # error and raises TypeError, same as calling print() twice with it
    print(*message, file=sys.stderr, **kwargs)
def fatal(*message, **kwargs):
    """Writes any provided values to standard error, then terminates.

    The interpreter exits with a non-zero status (1) once the message
    has been emitted, so this never returns to the caller.
    @param message <any>:
        Values printed to standard error
    @params kwargs <print()>
        Key words to modify print function behavior
    """
    # Reuse err() so all error output goes through one code path
    err(*message, **kwargs)
    sys.exit(1)
def check_permissions(parser, path, *args, **kwargs):
    """Checks permissions using os.access() to see if the
    user is authorized to access a file/directory. Checks
    for existence, read, write and execute via args:
        - os.F_OK (tests existence)
        - os.R_OK (tests read)
        - os.W_OK (tests write)
        - os.X_OK (tests exec)
    @param parser <argparse.ArgumentParser() object>:
        Argparse parser object
    @param path <str>:
        Name of path to check
    @param args <any>:
        Positional args to pass to os.access()
    @param kwargs <any>:
        Named kwargs to pass to os.access()
    @return path <str>:
        Returns absolute path if it exists and the
        checked permissions are setup correctly.
    """
    if not os.path.exists(path):
        # parser.error() prints the message to stderr and exits
        # with a non-zero status, so execution stops here
        parser.error(
            "Path '{}' does not exist! Failed to provide valid input.".format(path)
        )
    if not os.access(path, *args, **kwargs):
        # Worded as "access" (not "read") because the caller may be
        # checking read, write, or execute permissions
        parser.error(
            "Path '{}' exists, but cannot access path due to permissions!".format(path)
        )
    return os.path.abspath(path)
def parse_cli_arguments():
    """Parses command line arguments and returns
    an argparse.parse_args object.
    @return <argparse.parse_args()>:
        Parsed command line arguments
    """
    parser = argparse.ArgumentParser(
        add_help=False,
        description=_HELP,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        usage=argparse.SUPPRESS,
    )

    def readable(file):
        """Resolves a path to its absolute form, verifying read access."""
        return check_permissions(parser, file, os.R_OK)

    # Required input files: leafcutter effect sizes, leafcutter
    # cluster significance, leafviz intron annotation, and the
    # exon annotation produced by exon_annotation.py
    parser.add_argument(
        '-s', '--effect-sizes',
        type=readable, required=True,
        help=argparse.SUPPRESS,
    )
    parser.add_argument(
        '-c', '--cluster-signif',
        type=readable, required=True,
        help=argparse.SUPPRESS,
    )
    parser.add_argument(
        '-i', '--intron-ann',
        type=readable, required=True,
        help=argparse.SUPPRESS,
    )
    parser.add_argument(
        '-e', '--exon-ann',
        type=readable, required=True,
        help=argparse.SUPPRESS,
    )
    # Annotated output results
    parser.add_argument(
        '-o', '--output',
        type=str, required=True,
        help=argparse.SUPPRESS,
    )
    # Optional adjusted p-value (FDR) filtering threshold
    parser.add_argument(
        '-f', '--fdr-filter',
        type=float, required=False, default=0.1,
        help=argparse.SUPPRESS,
    )
    # Version information and custom help message
    parser.add_argument(
        '-v', '--version',
        action='version',
        help=argparse.SUPPRESS,
        version='%(prog)s {0}'.format(_VERISON),
    )
    parser.add_argument(
        '-h', '--help',
        action='help',
        help=argparse.SUPPRESS,
    )
    return parser.parse_args()
def stripped(s):
    """Cleans string to remove surrounding quotes
    @param s <str>:
        String to remove quotes or clean
    @return s <str>:
        Cleaned string with quotes removed
    """
    # Remove any leading/trailing double quotes first, then any
    # leading/trailing single quotes (same order as the original
    # chained strip calls)
    for quote in ('"', "'"):
        s = s.strip(quote)
    return s
def index_header(file_header):
    """Returns the index of each column_name
    as a dictionary.
    @param file_header <str>:
        First line of a file, containing column names
    @return idx <dict[str]=int>:
        Column name to index dictionary
    """
    # Columns are tab-delimited; each name is whitespace-trimmed and
    # then unquoted (double quotes first, then single quotes). If a
    # column name repeats, the right-most occurrence wins.
    return {
        name.strip().strip('"').strip("'"): position
        for position, name in enumerate(file_header.strip().split('\t'))
    }
def index_file(file, keys, key_delim, values):
    """Parses and indexes a file into a dictionary for quick
    lookups later. The file will be indexed on the column names
    of the keys provided and the values will be stored as a
    nested dictionary. If multiple keys are provided, they
    are concatenated into a single string where key_delim
    is the delimiter.
    @param file <str>:
        File to parse and index. Must contain a header with
        the columns listed in keys and values. The index of
        these columns will be automatically resolved. Files
        ending in ".gz" are transparently decompressed.
    @param keys <list[str]>:
        List of column names to index the file on. If more
        than one key is provided, then an index will be created
        by concatenating the keys into a single string where
        key_delim is the delimiter.
    @param key_delim <str>:
        Delimiter used to join multiple key columns into one
        lookup string.
    @param values <list[str]>:
        List of column names to associate with each key. These
        values will be stored as a nested dictionary so they can
        be pulled by their name.
    @return file_idx <dict[str]=dict>:
        Nested dictionary where,
            - key = 'key_delim'.join(keys)
            - value = {val_col1: "A", val_col2: "B"}
        Given,
            keys=["A","B"], values=["C","D"], key_delim="|"
        returns {"A|B": {"C": "c_i", "D": "d_i"}}
    """
    file_idx = {}
    # Handler for opening files, i.e.
    # uncompressed or gzip files
    open_func = gzip.open if file.endswith('.gz') else open
    with open_func(file, 'rt') as fh:
        header = next(fh)
        col_idx = index_header(header)
        # Track line numbers for error reporting;
        # start at 2 because line 1 is the header
        for line_number, line in enumerate(fh, start=2):
            # Split the line into columns
            tokens = line.strip().split('\t')
            if tokens == ['']:
                # Skip blank lines instead of failing on them
                continue
            try:
                # Concatenate multiple keys into a single key
                # separated by the key_delim character
                _k = key_delim.join([tokens[col_idx[k]] for k in keys])
                _v = {v: tokens[col_idx[v]] for v in values}
            except IndexError:
                # Row is shorter than the header; report the offending
                # line number so malformed input is easy to track down
                raise IndexError(
                    "Malformed line {0} in file '{1}': fewer columns than header".format(
                        line_number, file
                    )
                )
            file_idx[_k] = _v
    return file_idx
if __name__ == '__main__':
    # Sanity check for usage: this must run BEFORE argument parsing,
    # otherwise argparse exits first on the missing required options
    # and this branch is unreachable
    if len(sys.argv) == 1:
        # Nothing was provided
        fatal('Invalid usage: {0} [-h] ...'.format(os.path.basename(sys.argv[0])))

    # Parse command line arguments
    args = parse_cli_arguments()

    # Create output directory if
    # it does not exist
    output_dir = os.path.abspath(os.path.dirname(args.output))
    if not os.path.exists(output_dir):
        try:
            os.makedirs(output_dir)
        except OSError as e:
            fatal(
                "Fatal error: Failed to create output directory: {0}\n{1}".format(
                    output_dir, e
                )
            )

    # Parse cluster_id and adjusted pvalues,
    # from leafcutter output file where:
    # key = {chr}:{clust_id}
    ADJ_P_COLUMN_NAME = "p.adjust"
    PARSE_CLUSTER_SIGNIF = ["df", ADJ_P_COLUMN_NAME]
    ADJ_P_COLUMN_IDX = PARSE_CLUSTER_SIGNIF.index(ADJ_P_COLUMN_NAME)
    cluster_signif_dict = index_file(
        args.cluster_signif,
        keys=["cluster"],
        values=PARSE_CLUSTER_SIGNIF,
        key_delim=""
    )

    # Parse gene, ensembl_id, verdict,
    # and transcripts from leafviz intron
    # annotation file where:
    # key = {chr}:{intron_start}:{intron_end}:{clust_id}
    PARSE_INTRON_ANN = ["gene", "ensemblID", "verdict", "transcripts"]
    intron_ann_dict = index_file(
        args.intron_ann,
        keys=["chr", "start", "end", "clusterID"],
        values=PARSE_INTRON_ANN,
        key_delim=":"
    )

    # Loop through effect sizes file and append the cluster-level
    # significance plus intron annotation columns to each row.
    # Context managers guarantee both handles are closed on any
    # exit path (the output handle was previously never closed).
    with open(args.effect_sizes, "r") as ifh, open(args.output, "w") as ofh:
        input_header = next(ifh).rstrip().split("\t") + PARSE_CLUSTER_SIGNIF + PARSE_INTRON_ANN
        output_header = "\t".join(input_header)
        intron_idx = input_header.index("intron")
        ofh.write(output_header + "\n")
        for line in ifh:
            # Split the line into columns
            tokens = line.rstrip().split('\t')
            # where intron column format:
            # {chr}:{intron_start}:{intron_end}:{clust_id}
            intron = tokens[intron_idx]
            ichrom, istart, istop, icluster_id = intron.split(":")
            # where cluster_signif look
            # up key = {chr}:{clust_id}
            # (invariant across the values loop, so built once)
            clust_signif_key = "{0}:{1}".format(ichrom, icluster_id)
            _cluster_signif_values = []
            for v in PARSE_CLUSTER_SIGNIF:
                try:
                    parsed_clust_v = cluster_signif_dict[clust_signif_key][v]
                except KeyError:
                    # Cluster absent from the significance file
                    parsed_clust_v = "NA"
                _cluster_signif_values.append(parsed_clust_v)
            # Check if cluster meets FDR threshold
            try:
                fdr = float(_cluster_signif_values[ADJ_P_COLUMN_IDX])
            except ValueError:
                continue  # value cannot be type cast, i.e. NA
            if fdr > float(args.fdr_filter):
                continue  # does not meet filter
            # where intron_ann look
            # up key = {chr}:{intron_start}:{intron_end}:{clust_id}
            _intron_ann_values = []
            for v in PARSE_INTRON_ANN:
                try:
                    parsed_intron_v = intron_ann_dict[intron][v]
                except KeyError:
                    # Intron absent from the annotation file
                    parsed_intron_v = "NA"
                _intron_ann_values.append(parsed_intron_v)
            # Write annotated line to output
            _output_line = "{0}\t{1}\t{2}".format(
                "\t".join(tokens),
                "\t".join(_cluster_signif_values),
                "\t".join(_intron_ann_values)
            )
            ofh.write(_output_line + "\n")

0 commit comments

Comments
 (0)