AstraZeneca-NGS
diff --git a/‎Readme.md‎
Lines changed: 22 additions & 10 deletions b/‎Readme.md‎
Lines changed: 22 additions & 10 deletions
diff --git a/‎build.gradle‎
Lines changed: 2 additions & 1 deletion b/‎build.gradle‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎src/main/java/com/astrazeneca/vardict/CmdParser.java‎
Lines changed: 3 additions & 0 deletions b/‎src/main/java/com/astrazeneca/vardict/CmdParser.java‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎src/main/java/com/astrazeneca/vardict/Configuration.java‎
Lines changed: 5 additions & 0 deletions b/‎src/main/java/com/astrazeneca/vardict/Configuration.java‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎src/main/java/com/astrazeneca/vardict/Utils.java‎
Lines changed: 6 additions & 0 deletions b/‎src/main/java/com/astrazeneca/vardict/Utils.java‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎src/main/java/com/astrazeneca/vardict/data/fishertest/FisherExact.java‎
Lines changed: 200 additions & 0 deletions b/‎src/main/java/com/astrazeneca/vardict/data/fishertest/FisherExact.java‎
Lines changed: 200 additions & 0 deletions
@@ -99,18 +99,20 @@ standard Java library because its performance is much higher than that of the st
 
 To run VarDictJava in single sample mode, use a BAM file specified without the `|` symbol and perform Steps 3 and 4 
 (see the Program workflow section) using `teststrandbias.R` and `var2vcf_valid.pl`.
-The following is an example command to run in single sample mode:
+The following is an example command to run in single sample mode with a BED file.   
+You have to set the options `-c`, `-S`, `-E`, `-g` to the column numbers in your BED file that hold the chromosome, start, end
+ and gene of the region, respectively:
 
 ```
 AF_THR="0.01" # minimum allele frequency
-<path_to_vardict_folder>/build/install/VarDict/bin/VarDict -G /path/to/hg19.fa -f $AF_THR -N sample_name -b /path/to/my.bam -z -c 1 -S 2 -E 3 -g 4 /path/to/my.bed | VarDict/teststrandbias.R | VarDict/var2vcf_valid.pl -N sample_name -E -f $AF_THR
+<path_to_vardict_folder>/build/install/VarDict/bin/VarDict -G /path/to/hg19.fa -f $AF_THR -N sample_name -b /path/to/my.bam -c 1 -S 2 -E 3 -g 4 /path/to/my.bed | VarDict/teststrandbias.R | VarDict/var2vcf_valid.pl -N sample_name -E -f $AF_THR > vars.vcf
 ```
 
 VarDictJava can also be invoked without a BED file if the region is specified in the command line with `-R` option.
 The following is an example command to run VarDictJava for a region (chromosome 7, position from 55270300 to 55270348, EGFR gene) with `-R` option:
 
 ```
-<path_to_vardict_folder>/build/install/VarDict/bin/VarDict  -G /path/to/hg19.fa -f 0.001 -N sample_name -b /path/to/sample.bam  -z -R  chr7:55270300-55270348:EGFR | VarDict/teststrandbias.R | VarDict/var2vcf_valid.pl -N sample_name -E -f 0.001 >vars.vcf
+<path_to_vardict_folder>/build/install/VarDict/bin/VarDict  -G /path/to/hg19.fa -f 0.001 -N sample_name -b /path/to/sample.bam -R  chr7:55270300-55270348:EGFR | VarDict/teststrandbias.R | VarDict/var2vcf_valid.pl -N sample_name -E -f 0.001 > vars.vcf
 ```
 
 In single sample mode, output columns contain a description and statistical info for variants in the single sample. 
@@ -124,7 +126,9 @@ To run paired variant calling, use BAM files specified as `BAM1|BAM2` and perfor
 In this mode, the number of statistics columns in the output is doubled: one set of columns is 
 for the first sample, the other - for second sample.
 
-The following is an example command to run in paired mode:
+The following is an example command to run in paired mode.  
+You have to set the options `-c`, `-S`, `-E`, `-g` to the column numbers in your BED file that hold the chromosome, start, 
+ end and gene of the region, respectively:
 
 ```
 AF_THR="0.01" # minimum allele frequency
@@ -360,7 +364,7 @@ These are only rough classification. You need to examine the p-value (after test
 - `-F bit`  
     The hexical to filter reads. Default: `0x504` (filter unmapped reads, 2nd alignments and duplicates).  Use `-F 0` to turn it off.
 - `-z 0/1`       
-    Indicate whether the BED file contains zero-based coordinates, the same way as the Genome browser IGV does.  -z 1 indicates that coordinates in a BED file start from 0. -z 0 indicates that the coordinates start from 1. Default: `1` for a BED file or amplicon BED file.  Use `0` to turn it off. When using `-R` option, it is set to `0`
+    Indicate whether the BED file contains zero-based coordinates, the same way as the Genome browser IGV does.  -z 1 indicates that coordinates in a BED file start from 0. -z 0 indicates that the coordinates start from 1. Default: `1` for a BED file or amplicon BED file (0-based).  Use `0` to turn it off. When using `-R` option, it is set to `0`
 - `-a|--amplicon int:float`    
     Indicate it is amplicon based calling.  Reads that do not map to the amplicon will be skipped.  A read pair is considered to belong to the amplicon if the edges are less than int bp to the amplicon, and overlap fraction is at least float.  Default: `10:0.95`
 - `-k 0/1`   
@@ -485,6 +489,11 @@ These are only rough classification. You need to examine the p-value (after test
     The variant frequency threshold to determine variant as good in case of non-monomer MSI. Default: 0.1 
 - `--mfreq`  
     The variant frequency threshold to determine variant as good in case of monomer MSI. Default: 0.25
+- `--fisher`  
+    EXPERIMENTAL FEATURE: this option calculates the p-value and odds ratio of the Fisher exact test inside VarDictJava, so that the R script can be excluded from the VarDict pipeline. 
+    It decreases processing time on big samples because the R script uses the slow `textConnection` function.
+    If you use this option, do NOT run `teststrandbias.R` or `testsomatic.R` after VarDictJava, but run `var2vcf_valid.pl`
+    or `var2vcf_paired.pl` after VarDictJava as usual.
 ## Output columns
 ### Simple mode:
 1. Sample - sample name
@@ -599,14 +608,16 @@ Clusters - No. of clusters supporting SV from second sample
 ### Input Files
 
 #### BED File – Regions
-VarDict uses 2 types of BED files for specifying regions of interest: 4-column and 8-column. 
-The 8-column file format is used for targeted DNA deep sequencing analysis (amplicon based calling), 
-the 4-column file format - for single sample analysis.
+VarDict uses 2 types of BED files for specifying regions of interest: 8-column and all others. 
+The 8-column file format is used for targeted DNA deep sequencing analysis (amplicon based calling): VarDict will 
+try to start amplicon analysis whenever a BED file with 8 columns is provided.
+Otherwise, you can start single or paired sample analysis by providing the options `-c`, `-S`, `-E`, `-g` 
+with the column numbers for the chromosome, start, end and gene of the region, respectively.
 
 All lines starting with #, browser, and track in a BED file are skipped. 
 The column delimiter can be specified as the `-d` option (the default value is a tab “\t“).
 
-The 8-column file format involves the following data:
+The 8-column amplicon BED file format involves the following data:
 * Chromosome name
 * Region start position
 * Region end position
@@ -616,7 +627,8 @@ The 8-column file format involves the following data:
 * Start position – VarDict starts outputting variants from this position
 * End position – VarDict ends outputting variants from this position
 
-The 4-column file format involves the following data:
+For example, the 4-column BED file format involves the following data, and VarDict must be started with `-c 1 -S 2 -E 3 -g 4` to
+recognize it:
 * Chromosome name
 * Region start position
 * Region end position
 
@@ -4,7 +4,7 @@ plugins {
     id 'jacoco'
 }
 
-version = '1.7.0'
+version = '1.8.0'
 
 repositories { 
     mavenCentral() 
@@ -23,6 +23,7 @@ afterEvaluate {
 
 dependencies {
     compile 'commons-cli:commons-cli:1.2' 
+    compile 'org.apache.commons:commons-math3:3.6.1'
     compile 'com.edropple.jregex:jregex:1.2_01'
     compile('com.github.samtools:htsjdk:2.8.0') {
         transitive = false
 
@@ -166,6 +166,8 @@ private Configuration parseCmd(CommandLine cmd) throws ParseException {
             config.adaptor.addAll(Arrays.asList(cmd.getOptionValue("adaptor").split(",")));
         }
 
+        config.fisher = cmd.hasOption("fisher");
+
         if (cmd.hasOption("DP")) {
             String defaultPrinter = cmd.getOptionValue("DP", PrinterType.OUT.name());
             switch(defaultPrinter) {
@@ -231,6 +233,7 @@ private Options buildOptions() {
         options.addOption("UN", false, "Indicate unique mode, which when mate pairs overlap, the overlapping part will be counted only once using first read only.");
         options.addOption("chimeric", false, "Indicate to turn off chimeric reads filtering.");
         options.addOption("deldupvar", false, "Turn on deleting of duplicate variants. Variants in this mode are considered and outputted only if start position of variant is inside the region interest.");
+        options.addOption("fisher", false, "Experimental feature: Changes R script (teststrandbias.R and testsomatic.) to Java implementation of Fisher exact test.");
         options.addOption("U", "nosv", false, "Turn off structural variant calling.");
 
         options.addOption(OptionBuilder.withArgName("bit")
 
@@ -229,6 +229,11 @@ public class Configuration {
      */
     public boolean deleteDuplicateVariants = false;
 
+    /**
+     * Applying Fisher exact test on forward and reverse counts of variant.
+     */
+    public boolean fisher = false;
+
     /**
      * The minimum distance between two SV clusters in term of read length
      */
 
@@ -102,6 +102,12 @@ public static double roundHalfEven(String pattern, double value) {
         return Double.parseDouble(new DecimalFormat(pattern).format(value));
     }
 
+    /**
+     * Formats a value for printing without insignificant trailing zeros.
+     * <p>
+     * Integral values are printed without a fractional part. Other values are formatted with
+     * {@code pattern} and then trailing zeros of the fractional part are removed; if the whole
+     * fractional part consists of zeros, the decimal separator is removed as well. Digits of the
+     * integer part are never stripped.
+     *
+     * @param pattern {@link DecimalFormat} pattern used for non-integral values
+     * @param value   value to format
+     * @return formatted value
+     */
+    public static String getRoundedValueToPrint(String pattern, double value) {
+        if (value == Math.round(value)) {
+            return new DecimalFormat("0").format(value);
+        }
+        DecimalFormat format = new DecimalFormat(pattern);
+        String formatted = format.format(value);
+        int separatorIdx = formatted.lastIndexOf(format.getDecimalFormatSymbols().getDecimalSeparator());
+        if (separatorIdx < 0) {
+            // No fractional part was printed, so every trailing zero is a significant integer digit.
+            return formatted;
+        }
+        int end = formatted.length();
+        while (end > separatorIdx + 1 && formatted.charAt(end - 1) == '0') {
+            end--;
+        }
+        if (end == separatorIdx + 1) {
+            // The fractional part was all zeros: drop the dangling decimal separator too.
+            end = separatorIdx;
+        }
+        return formatted.substring(0, end);
+    }
+
     /**
      * Method creates substring of string begin from specified idx.
      * If idx is negative, it returns substring, counted from the right end of string.
 
@@ -0,0 +1,200 @@
+package com.astrazeneca.vardict.data.fishertest;
+
+import org.apache.commons.math3.distribution.HypergeometricDistribution;
+
+import java.text.DecimalFormat;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.function.Function;
+
+import static com.astrazeneca.vardict.Utils.roundHalfEven;
+
+/**
+ * EXPERIMENTAL FEATURE.
+ * <p>
+ * Implementation of the Fisher exact test for a 2x2 table of reference/alternative counts on the
+ * forward and reverse strands, done the same way as it is implemented in R.
+ * <p>
+ * For the odds ratio the R implementation of the Fisher test uses the conditional MLE (maximum
+ * likelihood estimate), which wasn't found in standard libraries for Java.
+ * <p>
+ * The reason to replace the R Fisher test with this implementation is the slow R {@code textConnection}
+ * function. Otherwise we would have to use temporary files to process the VarDict result in R faster,
+ * and this is not a good option.
+ */
+public class FisherExact {
+    /** Log-density of the central hypergeometric distribution, one entry per element of {@link #support}. */
+    private final List<Double> logdc;
+    /** Number of reference reads: refFwd + refRev. */
+    private final int m;
+    /** Number of alternative reads: altFwd + altRev. */
+    private final int n;
+    /** Number of forward reads: refFwd + altFwd. */
+    private final int k;
+    /** Observed value of the first table cell (refFwd). */
+    private final int x;
+    /** Smallest value the first cell can take given the margins. */
+    private final int lo;
+    /** Largest value the first cell can take given the margins. */
+    private final int hi;
+    private double pValueLess;
+    private double pValueGreater;
+    private double pValueTwoSided;
+    /** All values from {@link #lo} to {@link #hi} inclusive. */
+    private final List<Integer> support;
+
+    /**
+     * Scale used when rounding results: values are multiplied by it, rounded half-even to an integer
+     * and divided back. Java and R seem to differ in half-even rounding (see JDK-8227248 for an
+     * example), so the in-memory value is rounded explicitly.
+     */
+    public static double RESULT_ROUND_R = 1E5;
+
+    /**
+     * Builds the test for the given strand counts and computes the p-values immediately.
+     *
+     * @param refFwd reference count on the forward strand
+     * @param refRev reference count on the reverse strand
+     * @param altFwd alternative count on the forward strand
+     * @param altRev alternative count on the reverse strand
+     */
+    public FisherExact(int refFwd, int refRev, int altFwd, int altRev) {
+        m = refFwd + refRev;
+        n = altFwd + altRev;
+        k = refFwd + altFwd;
+        x = refFwd;
+        lo = Math.max(0, k - n);
+        hi = Math.min(k, m);
+        support = new ArrayList<>();
+        for (int j = lo; j <= hi; j++) {
+            support.add(j);
+        }
+        logdc = logdcDhyper(m, n, k);
+
+        calculatePValue();
+    }
+
+    // Density of the central hypergeometric distribution on its support: stored once as it is needed quite a bit.
+    private List<Double> logdcDhyper(int m, int n, int k) {
+        List<Double> density = new ArrayList<>(support.size());
+
+        if (m + n == 0) {
+            // Empty table: the distribution cannot be constructed, use zero log-density.
+            for (int i = 0; i < support.size(); i++) {
+                density.add(0.0);
+            }
+            return density;
+        }
+        // m + n - population size (all counts), m - number of successes (reference), k - sample size (forward).
+        // The distribution does not depend on the support element, so it is created once.
+        HypergeometricDistribution dhyper = new HypergeometricDistribution(m + n, m, k);
+        for (int element : support) {
+            double value = dhyper.logProbability(element);
+            if (Double.isNaN(value)) {
+                value = 0.0;
+            }
+            density.add(roundHalfEven("0.0000000", value));
+        }
+        return density;
+    }
+
+    // Determine the MLE for ncp by solving E(X) = x, where the expectation is with respect to H.
+    // Note that in general the conditional distribution of x given the marginals is a non-central hypergeometric
+    // distribution H with non-centrality parameter ncp, the odds ratio.
+    // The null of conditional independence is equivalent to the hypothesis that the odds ratio equals one. `Exact`
+    // inference can be based on observing that in general, given all marginal totals fixed, the first element of the
+    // contingency table has a non-central hypergeometric distribution with non-centrality parameter given by the odds
+    // ratio (Fisher, 1935). The alternative for a one-sided test is based on the odds ratio, so alternative =
+    // 'greater' is a test of the odds ratio being bigger than the hypothesized odds ratio (1 here).
+    private Double mle(double x) {
+        double eps = Math.ulp(1.0);
+        if (x == lo) {
+            return 0.0;
+        }
+        if (x == hi) {
+            return Double.POSITIVE_INFINITY;
+        }
+        double mu = mnhyper(1.0);
+        double tolerance = Math.pow(eps, 0.25);
+        double root;
+        if (mu > x) {
+            // The estimate is below 1: search for the root on [0, 1].
+            Function<Double, Double> f = t -> mnhyper(t) - x;
+            root = UnirootZeroIn.zeroinC(0, 1, f, tolerance);
+        } else if (mu < x) {
+            // The estimate is above 1: search for its reciprocal on [eps, 1].
+            Function<Double, Double> f = t -> mnhyper(1.0 / t) - x;
+            root = 1.0 / UnirootZeroIn.zeroinC(eps, 1, f, tolerance);
+        } else {
+            root = 1.0;
+        }
+        return root;
+    }
+
+    // Mean of the non-central hypergeometric distribution with non-centrality parameter ncp.
+    private Double mnhyper(Double ncp) {
+        if (ncp == 0) {
+            return (double) lo;
+        }
+        if (ncp.isInfinite()) {
+            return (double) hi;
+        }
+        List<Double> density = dnhyper(ncp);
+        List<Double> weighted = new ArrayList<>(support.size());
+        for (int i = 0; i < support.size(); i++) {
+            weighted.add(support.get(i) * density.get(i));
+        }
+        // Stream summation is kept on purpose so that the numerical result stays the same.
+        return weighted.stream().mapToDouble(a -> a).sum();
+    }
+
+    // Density of the non-central hypergeometric distribution on the support, normalized to sum to one.
+    private List<Double> dnhyper(Double ncp) {
+        double logNcp = Math.log(ncp);
+        List<Double> logDensity = new ArrayList<>(support.size());
+        for (int i = 0; i < support.size(); i++) {
+            logDensity.add(logdc.get(i) + logNcp * support.get(i));
+        }
+        // Subtract the maximum before exponentiation to avoid overflow.
+        double maxLogDensity = Collections.max(logDensity);
+        List<Double> exponents = new ArrayList<>(logDensity.size());
+        for (double el : logDensity) {
+            exponents.add(Math.exp(el - maxLogDensity));
+        }
+        double sum = exponents.stream().mapToDouble(a -> a).sum();
+        List<Double> result = new ArrayList<>(exponents.size());
+        for (double element : exponents) {
+            result.add(element / sum);
+        }
+        return result;
+    }
+
+    /**
+     * @return conditional MLE of the odds ratio as a string: {@code "Inf"} for an infinite estimate, an integer
+     * without fractional part for integral estimates, otherwise the rounded value
+     */
+    public String getOddRatio() {
+        Double oddRatio = mle(x);
+        if (oddRatio.isInfinite()) {
+            return "Inf";
+        } else if (oddRatio == Math.round(oddRatio)) {
+            return new DecimalFormat("0").format(oddRatio);
+        } else {
+            return String.valueOf(roundAsR(oddRatio));
+        }
+    }
+
+    /**
+     * @return rounded two-sided p-value
+     */
+    public double getPValue() {
+        return roundAsR(pValueTwoSided);
+    }
+
+    /**
+     * @return copy of the log-density of the central hypergeometric distribution on its support
+     */
+    public List<Double> getLogdc() {
+        // The density is computed once in the constructor; a copy keeps the internal state safe from callers.
+        return new ArrayList<>(logdc);
+    }
+
+    /**
+     * @return rounded upper-tail p-value
+     */
+    public double getPValueGreater() {
+        return roundAsR(pValueGreater);
+    }
+
+    /**
+     * @return rounded lower-tail p-value
+     */
+    public double getPValueLess() {
+        return roundAsR(pValueLess);
+    }
+
+    // Rounds half-even at the precision given by RESULT_ROUND_R.
+    private double roundAsR(double value) {
+        double rounded = roundHalfEven("0", value * RESULT_ROUND_R) / RESULT_ROUND_R;
+        // Normalizes negative zero to positive zero.
+        return rounded == 0.0 ? 0.0 : rounded;
+    }
+
+    private void calculatePValue() {
+        pValueLess = pnhyper(x, false);
+        pValueGreater = pnhyper(x, true);
+
+        // Two-sided p-value: sum of probabilities of all tables that are not more likely than the observed one,
+        // with a relative tolerance for floating point comparison.
+        double relErr = 1 + 1E-7;
+        List<Double> d = dnhyper(1.0);
+        double threshold = d.get(x - lo) * relErr;
+        double sum = 0.0;
+        for (double el : d) {
+            if (el <= threshold) {
+                sum += el;
+            }
+        }
+        pValueTwoSided = sum;
+    }
+
+    // Cumulative probability of the central hypergeometric distribution: upper tail (X >= q) or lower tail (X <= q).
+    private double pnhyper(int q, boolean upper_tail) {
+        if (m + n == 0) {
+            return 1.0;
+        }
+        HypergeometricDistribution dhyper = new HypergeometricDistribution(m + n, m, k);
+        return upper_tail
+                ? dhyper.upperCumulativeProbability(q)
+                : dhyper.cumulativeProbability(q);
+    }
+}
+
+