Skip to content

Commit 6f1a412

Browse files
committed
Adding script to add extra annotation information to leafcutter results.
1 parent fddea05 commit 6f1a412

1 file changed

Lines changed: 380 additions & 0 deletions

File tree

Lines changed: 380 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,380 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: UTF-8 -*-
3+
# Author: Skyler Kuhn
4+
5+
# Standard Library
6+
from __future__ import print_function
7+
from textwrap import dedent
8+
import argparse, gzip, os, sys
9+
10+
# Constants
11+
# Usage and help section
12+
_HELP = dedent("""
13+
@Usage:
14+
$ ./leafcutter_annotation.py [-h] [--version] \\
15+
[--fdr-filter FDR_FILTER] \\
16+
--effect-sizes EFFECT_SIZES_FILE \\
17+
--cluster-signif CLUSTER_SIGNIF_FILE \\
18+
--intron-ann INTRON_ANN_FILE \\
19+
--exon-ann EXON_ANN_FILE \\
20+
--output OUTPUT_FILE
21+
@About:
22+
Given the differential splicing results from
23+
leafcutter (effect size, cluster signif output
24+
files), an intron annotation file, and an exon
25+
annotation file, this script will collate info
26+
from each source to add the cluster adjusted
27+
p-values, transcript information and exon
28+
information to the intron effect sizes file.
29+
This allow a user to quickly filter the output
30+
from leafcutter and to see what transcripts and
31+
exons are associated with any characterized
32+
differential splicing events.
33+
34+
The results can also be filtered based on an
35+
adjusted p-value threshold, where the default
36+
is set to "0.1".
37+
38+
@Required:
39+
-s, --effect-sizes EFFECT_SIZES_FILE
40+
Input leafcutter effect sizes file.
41+
This file is generated by running
42+
"leafcutter_ds.R".
43+
-c, --cluster-signif CLUSTER_SIGNIF_FILE
44+
Input leafcutter cluster significance
45+
file. This file is generated by
46+
running "leafcutter_ds.R".
47+
-i, --intron-ann INTRON_ANN_FILE
48+
Input intron annotation file. This
49+
file was generated by exporting the
50+
"intron" table from the Rdata file
51+
generated by "prepare_results.R".
52+
-e, --exon-ann EXON_ANN_FILE
53+
Input exon annotation file. This file
54+
was generated by parsing exon info
55+
from the GTF file. It is the output
56+
file of "exon_annotation.py".
57+
-o, --output OUTPUT_FILE
58+
Output file with merged and annotated
59+
leafcutter results.
60+
@Options:
61+
-f, --fdr-filter FDR_FILTER
62+
Adjusted p-value filter. This option
63+
will filter the results to only focus
64+
on differential splicing events with
65+
a cluster significance less than or
66+
equal to this value, default: "0.1".
67+
-h, --help
68+
Shows help message and exits.
69+
-v, --version
70+
Prints the version and exits.
71+
72+
@Example:
73+
$ ./leafcutter_annotation.py \\
74+
-s leafcutter_effect_sizes.txt \\
75+
-c leafcutter_cluster_significance.txt \\
76+
-i intron_annotation.tsv \\
77+
-e exon_annotation.tsv \\
78+
-o leafcutter_annotated_results.tsv \\
79+
-f 0.1
80+
"""
81+
)
82+
83+
# Semantic version
84+
_VERISON = '1.0.0'
85+
86+
87+
# Helper functions
def err(*message, **kwargs):
    """Writes any provided positional values to standard error.

    Thin wrapper around the builtin print() that forces output onto
    sys.stderr; all other print() keyword arguments pass through.
    @param message <any>:
        Values printed to standard error
    @params kwargs <print()>
        Key words to modify print function behavior
    """
    # file= is pinned to stderr; passing file in kwargs is a caller
    # error and raises TypeError, same as calling print() twice with it
    print(*message, file=sys.stderr, **kwargs)
def fatal(*message, **kwargs):
    """Writes any provided values to standard error, then terminates.

    The interpreter exits with a non-zero status (1) once the message
    has been emitted, so this never returns to the caller.
    @param message <any>:
        Values printed to standard error
    @params kwargs <print()>
        Key words to modify print function behavior
    """
    # Reuse err() so all error output goes through one code path
    err(*message, **kwargs)
    sys.exit(1)
def check_permissions(parser, path, *args, **kwargs):
    """Checks permissions using os.access() to see if the
    user is authorized to access a file/directory. Checks
    for existence, read, write and execute via args:
        - os.F_OK (tests existence)
        - os.R_OK (tests read)
        - os.W_OK (tests write)
        - os.X_OK (tests exec)
    @param parser <argparse.ArgumentParser() object>:
        Argparse parser object
    @param path <str>:
        Name of path to check
    @param args <any>:
        Positional args to pass to os.access()
    @param kwargs <any>:
        Named kwargs to pass to os.access()
    @return path <str>:
        Returns absolute path if it exists and the
        checked permissions are setup correctly.
    """
    if not os.path.exists(path):
        # parser.error() prints the message to stderr and exits
        # with a non-zero status, so execution stops here
        parser.error(
            "Path '{}' does not exist! Failed to provide valid input.".format(path)
        )
    if not os.access(path, *args, **kwargs):
        # Worded as "access" (not "read") because the caller may be
        # checking read, write, or execute permissions
        parser.error(
            "Path '{}' exists, but cannot access path due to permissions!".format(path)
        )
    return os.path.abspath(path)
def parse_cli_arguments():
    """Parses command line arguments and returns
    an argparse.parse_args object.
    @return <argparse.parse_args()>:
        Parsed command line arguments
    """
    parser = argparse.ArgumentParser(
        add_help=False,
        description=_HELP,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        usage=argparse.SUPPRESS,
    )

    def readable(file):
        """Resolves a path to its absolute form, verifying read access."""
        return check_permissions(parser, file, os.R_OK)

    # Required input files: leafcutter effect sizes, leafcutter
    # cluster significance, leafviz intron annotation, and the
    # exon annotation produced by exon_annotation.py
    parser.add_argument(
        '-s', '--effect-sizes',
        type=readable, required=True,
        help=argparse.SUPPRESS,
    )
    parser.add_argument(
        '-c', '--cluster-signif',
        type=readable, required=True,
        help=argparse.SUPPRESS,
    )
    parser.add_argument(
        '-i', '--intron-ann',
        type=readable, required=True,
        help=argparse.SUPPRESS,
    )
    parser.add_argument(
        '-e', '--exon-ann',
        type=readable, required=True,
        help=argparse.SUPPRESS,
    )
    # Annotated output results
    parser.add_argument(
        '-o', '--output',
        type=str, required=True,
        help=argparse.SUPPRESS,
    )
    # Optional adjusted p-value (FDR) filtering threshold
    parser.add_argument(
        '-f', '--fdr-filter',
        type=float, required=False, default=0.1,
        help=argparse.SUPPRESS,
    )
    # Version information and custom help message
    parser.add_argument(
        '-v', '--version',
        action='version',
        help=argparse.SUPPRESS,
        version='%(prog)s {0}'.format(_VERISON),
    )
    parser.add_argument(
        '-h', '--help',
        action='help',
        help=argparse.SUPPRESS,
    )
    return parser.parse_args()
def stripped(s):
    """Cleans string to remove surrounding quotes
    @param s <str>:
        String to remove quotes or clean
    @return s <str>:
        Cleaned string with quotes removed
    """
    # Remove any leading/trailing double quotes first, then any
    # leading/trailing single quotes (same order as the original
    # chained strip calls)
    for quote in ('"', "'"):
        s = s.strip(quote)
    return s
def index_header(file_header):
    """Returns the index of each column_name
    as a dictionary.
    @param file_header <str>:
        First line of a file, containing column names
    @return idx <dict[str]=int>:
        Column name to index dictionary
    """
    # Columns are tab-delimited; each name is whitespace-trimmed and
    # then unquoted (double quotes first, then single quotes). If a
    # column name repeats, the right-most occurrence wins.
    return {
        name.strip().strip('"').strip("'"): position
        for position, name in enumerate(file_header.strip().split('\t'))
    }
def index_file(file, keys, key_delim, values):
    """Parses and indexes a file into a dictionary for quick
    lookups later. The file will be indexed on the column names
    of the keys provided and the values will be stored as a
    nested dictionary. If multiple keys are provided, they
    are concatenated into a single string where key_delim
    is the delimiter.
    @param file <str>:
        File to parse and index. Must contain a header with
        the columns listed in keys and values. The index of
        these columns will be automatically resolved. Files
        ending in ".gz" are transparently decompressed.
    @param keys <list[str]>:
        List of column names to index the file on. If more
        than one key is provided, then an index will be created
        by concatenating the keys into a single string where
        key_delim is the delimiter.
    @param key_delim <str>:
        Delimiter used to join multiple key columns into one
        lookup string.
    @param values <list[str]>:
        List of column names to associate with each key. These
        values will be stored as a nested dictionary so they can
        be pulled by their name.
    @return file_idx <dict[str]=dict>:
        Nested dictionary where,
            - key = 'key_delim'.join(keys)
            - value = {val_col1: "A", val_col2: "B"}
        Given,
            keys=["A","B"], values=["C","D"], key_delim="|"
        returns {"A|B": {"C": "c_i", "D": "d_i"}}
    """
    file_idx = {}
    # Handler for opening files, i.e.
    # uncompressed or gzip files
    open_func = gzip.open if file.endswith('.gz') else open
    with open_func(file, 'rt') as fh:
        header = next(fh)
        col_idx = index_header(header)
        # Track line numbers for error reporting;
        # start at 2 because line 1 is the header
        for line_number, line in enumerate(fh, start=2):
            # Split the line into columns
            tokens = line.strip().split('\t')
            if tokens == ['']:
                # Skip blank lines instead of failing on them
                continue
            try:
                # Concatenate multiple keys into a single key
                # separated by the key_delim character
                _k = key_delim.join([tokens[col_idx[k]] for k in keys])
                _v = {v: tokens[col_idx[v]] for v in values}
            except IndexError:
                # Row is shorter than the header; report the offending
                # line number so malformed input is easy to track down
                raise IndexError(
                    "Malformed line {0} in file '{1}': fewer columns than header".format(
                        line_number, file
                    )
                )
            file_idx[_k] = _v
    return file_idx
if __name__ == '__main__':
    # Sanity check for usage: this must run BEFORE argument parsing,
    # otherwise argparse exits first on the missing required options
    # and this branch is unreachable
    if len(sys.argv) == 1:
        # Nothing was provided
        fatal('Invalid usage: {0} [-h] ...'.format(os.path.basename(sys.argv[0])))

    # Parse command line arguments
    args = parse_cli_arguments()

    # Create output directory if
    # it does not exist
    output_dir = os.path.abspath(os.path.dirname(args.output))
    if not os.path.exists(output_dir):
        try:
            os.makedirs(output_dir)
        except OSError as e:
            fatal(
                "Fatal error: Failed to create output directory: {0}\n{1}".format(
                    output_dir, e
                )
            )

    # Parse cluster_id and adjusted pvalues,
    # from leafcutter output file where:
    # key = {chr}:{clust_id}
    ADJ_P_COLUMN_NAME = "p.adjust"
    PARSE_CLUSTER_SIGNIF = ["df", ADJ_P_COLUMN_NAME]
    ADJ_P_COLUMN_IDX = PARSE_CLUSTER_SIGNIF.index(ADJ_P_COLUMN_NAME)
    cluster_signif_dict = index_file(
        args.cluster_signif,
        keys=["cluster"],
        values=PARSE_CLUSTER_SIGNIF,
        key_delim=""
    )

    # Parse gene, ensembl_id, verdict,
    # and transcripts from leafviz intron
    # annotation file where:
    # key = {chr}:{intron_start}:{intron_end}:{clust_id}
    PARSE_INTRON_ANN = ["gene", "ensemblID", "verdict", "transcripts"]
    intron_ann_dict = index_file(
        args.intron_ann,
        keys=["chr", "start", "end", "clusterID"],
        values=PARSE_INTRON_ANN,
        key_delim=":"
    )

    # Loop through effect sizes file and append the cluster-level
    # significance plus intron annotation columns to each row.
    # Context managers guarantee both handles are closed on any
    # exit path (the output handle was previously never closed).
    with open(args.effect_sizes, "r") as ifh, open(args.output, "w") as ofh:
        input_header = next(ifh).rstrip().split("\t") + PARSE_CLUSTER_SIGNIF + PARSE_INTRON_ANN
        output_header = "\t".join(input_header)
        intron_idx = input_header.index("intron")
        ofh.write(output_header + "\n")
        for line in ifh:
            # Split the line into columns
            tokens = line.rstrip().split('\t')
            # where intron column format:
            # {chr}:{intron_start}:{intron_end}:{clust_id}
            intron = tokens[intron_idx]
            ichrom, istart, istop, icluster_id = intron.split(":")
            # where cluster_signif look
            # up key = {chr}:{clust_id}
            # (invariant across the values loop, so built once)
            clust_signif_key = "{0}:{1}".format(ichrom, icluster_id)
            _cluster_signif_values = []
            for v in PARSE_CLUSTER_SIGNIF:
                try:
                    parsed_clust_v = cluster_signif_dict[clust_signif_key][v]
                except KeyError:
                    # Cluster absent from the significance file
                    parsed_clust_v = "NA"
                _cluster_signif_values.append(parsed_clust_v)
            # Check if cluster meets FDR threshold
            try:
                fdr = float(_cluster_signif_values[ADJ_P_COLUMN_IDX])
            except ValueError:
                continue  # value cannot be type cast, i.e. NA
            if fdr > float(args.fdr_filter):
                continue  # does not meet filter
            # where intron_ann look
            # up key = {chr}:{intron_start}:{intron_end}:{clust_id}
            _intron_ann_values = []
            for v in PARSE_INTRON_ANN:
                try:
                    parsed_intron_v = intron_ann_dict[intron][v]
                except KeyError:
                    # Intron absent from the annotation file
                    parsed_intron_v = "NA"
                _intron_ann_values.append(parsed_intron_v)
            # Write annotated line to output
            _output_line = "{0}\t{1}\t{2}".format(
                "\t".join(tokens),
                "\t".join(_cluster_signif_values),
                "\t".join(_intron_ann_values)
            )
            ofh.write(_output_line + "\n")

0 commit comments

Comments
 (0)