|
| 1 | +package com.astrazeneca.vardict.data.fishertest; |
| 2 | + |
| 3 | +import org.apache.commons.math3.distribution.HypergeometricDistribution; |
| 4 | + |
| 5 | +import java.text.DecimalFormat; |
| 6 | +import java.util.ArrayList; |
| 7 | +import java.util.Collections; |
| 8 | +import java.util.List; |
| 9 | +import java.util.function.Function; |
| 10 | + |
| 11 | +import static com.astrazeneca.vardict.Utils.roundHalfEven; |
| 12 | + |
| 13 | +/** |
| 14 | + * EXPERIMENTAL FEATURE. |
| 15 | + * <p> |
| 16 | + * Implementation of FisherExact Test as it is implemented in R. |
| 17 | + * <p> |
| 18 | + * R implementation of Fisher Test for oddratio uses conditional MLE (maximum likelihood estimation) |
| 19 | + * that wasn't found in standard libraries for Java. |
| 20 | + * <p> |
| 21 | + * Reason to replace R fisher test with this implementation is a slow R `textConnection` function. |
| 22 | + * In other case we have to use temp files to process VarDict result in R faster, and this is not a good option. |
| 23 | + */ |
| 24 | + |
| 25 | +public class FisherExact { |
| 26 | + private List<Double> logdc; |
| 27 | + private int m; |
| 28 | + private int n; |
| 29 | + private int k; |
| 30 | + private int x; |
| 31 | + private int lo; |
| 32 | + private int hi; |
| 33 | + private double PvalueLess; |
| 34 | + private double PvalueGreater; |
| 35 | + private double PvalueTwoSided; |
| 36 | + private List<Integer> support; |
| 37 | + |
| 38 | + // Seems that Java and R have differences with round half even (JDK-8227248 example, it will round value in memory) |
| 39 | + public static double RESULT_ROUND_R = 1E5; |
| 40 | + |
| 41 | + public FisherExact(int refFwd, int refRev, int altFwd, int altRev) { |
| 42 | + m = refFwd + refRev; |
| 43 | + n = altFwd + altRev; |
| 44 | + k = refFwd + altFwd; |
| 45 | + x = refFwd; |
| 46 | + lo = Math.max(0, k - n); |
| 47 | + hi = Math.min(k, m); |
| 48 | + support = new ArrayList<>(); |
| 49 | + for (int j = lo; j <= hi; j++) { |
| 50 | + support.add(j); |
| 51 | + } |
| 52 | + logdc = logdcDhyper(m, n, k); |
| 53 | + |
| 54 | + calculatePValue(); |
| 55 | + } |
| 56 | + |
| 57 | + // Density of the central hypergeometric distribution on its support: store for once as this is needed quite a bit. |
| 58 | + private List<Double> logdcDhyper(int m, int n, int k) { |
| 59 | + List<Double> logdc = new ArrayList<>(); |
| 60 | + |
| 61 | + for (int element : support) { |
| 62 | + if (m + n == 0) { |
| 63 | + logdc.add(0.0); |
| 64 | + continue; |
| 65 | + } |
| 66 | + // m + n - total number of successes, m - number of successes (reference) k - sample size (forward) |
| 67 | + HypergeometricDistribution dhyper = new HypergeometricDistribution(m + n, m, k); |
| 68 | + Double value = dhyper.logProbability(element); |
| 69 | + if (value.isNaN()) { |
| 70 | + value = 0.0; |
| 71 | + } |
| 72 | + logdc.add(roundHalfEven("0.0000000", value)); |
| 73 | + } |
| 74 | + return logdc; |
| 75 | + } |
| 76 | + |
| 77 | + // Determine the MLE for ncp by solving E(X) = x, where the expectation is with respect to H. |
| 78 | + // Note that in general the conditional distribution of x given the marginals is a non-central hypergeometric |
| 79 | + // distribution H with non-centrality parameter ncp, the odds ratio. |
| 80 | + // The null conditional independence is equivalent to the hypothesis that the odds ratio equals one. `Exact` |
| 81 | + // inference can be based on observing that in general, given all marginal totals fixed, the first element of the |
| 82 | + // contingency table has a non-central hypergeometric distribution with non-centrality parameter given by odds |
| 83 | + // ratio (Fisher, 1935). The alternative for a one-sided test is based on the odds ratio, so alternative = |
| 84 | + // 'greater' is a test of the odds ratio being bigger than or = 1. |
| 85 | + private Double mle(double x) { |
| 86 | + double eps = Math.ulp(1.0); |
| 87 | + if (x == lo) return 0.0; |
| 88 | + if (x == hi) return Double.POSITIVE_INFINITY; |
| 89 | + double mu = mnhyper(1.0); |
| 90 | + double root; |
| 91 | + if (mu > x) { |
| 92 | + Function<Double, Double> f = t -> mnhyper(t) - x; |
| 93 | + root = UnirootZeroIn.zeroinC(0, 1, f, Math.pow(eps, 0.25)); |
| 94 | + } else if (mu < x) { |
| 95 | + Function<Double, Double> f = t -> mnhyper(1.0 / t) - x; |
| 96 | + root = 1.0 / UnirootZeroIn.zeroinC(eps, 1, f, Math.pow(eps, 0.25)); |
| 97 | + } else { |
| 98 | + root = 1.0; |
| 99 | + } |
| 100 | + return root; |
| 101 | + } |
| 102 | + |
| 103 | + private Double mnhyper(Double ncp) { |
| 104 | + if (ncp == 0) return (double) lo; |
| 105 | + if (ncp.isInfinite()) return (double) hi; |
| 106 | + else { |
| 107 | + List<Double> dnhyperResult = dnhyper(ncp); |
| 108 | + List<Double> multiply = new ArrayList<>(); |
| 109 | + for (int i = 0; i < support.size(); i++) { |
| 110 | + multiply.add(support.get(i) * dnhyperResult.get(i)); |
| 111 | + } |
| 112 | + double b = multiply.stream().mapToDouble(a -> a).sum(); |
| 113 | + return b; |
| 114 | + } |
| 115 | + } |
| 116 | + |
| 117 | + private List<Double> dnhyper(Double ncp) { |
| 118 | + List<Double> result = new ArrayList<>(); |
| 119 | + for (int i = 0; i < support.size(); i++) { |
| 120 | + result.add(logdc.get(i) + Math.log(ncp) * support.get(i)); |
| 121 | + } |
| 122 | + double maxResult = Collections.max(result); |
| 123 | + List<Double> exponentResult = new ArrayList<>(); |
| 124 | + |
| 125 | + for (double el : result) { |
| 126 | + exponentResult.add(Math.exp(el - maxResult)); |
| 127 | + } |
| 128 | + result = new ArrayList<>(); |
| 129 | + double sum = exponentResult.stream().mapToDouble(a -> a).sum(); |
| 130 | + for (double element : exponentResult) { |
| 131 | + result.add(element / sum); |
| 132 | + } |
| 133 | + return result; |
| 134 | + } |
| 135 | + |
| 136 | + public String getOddRatio() { |
| 137 | + Double oddRatio = mle(x); |
| 138 | + if (oddRatio.isInfinite()) { |
| 139 | + return "Inf"; |
| 140 | + } else if (oddRatio == Math.round(oddRatio)) { |
| 141 | + return new DecimalFormat("0").format(oddRatio); |
| 142 | + } else { |
| 143 | + return String.valueOf(round_as_r(oddRatio)); |
| 144 | + } |
| 145 | + } |
| 146 | + |
| 147 | + public double getPValue() { |
| 148 | + return round_as_r(PvalueTwoSided); |
| 149 | + } |
| 150 | + |
| 151 | + public List<Double> getLogdc() { |
| 152 | + logdc = logdcDhyper(m, n, k); |
| 153 | + return logdc; |
| 154 | + } |
| 155 | + |
| 156 | + public double getPValueGreater() { |
| 157 | + return round_as_r(PvalueGreater); |
| 158 | + } |
| 159 | + |
| 160 | + public double getPValueLess() { |
| 161 | + return round_as_r(PvalueLess); |
| 162 | + } |
| 163 | + |
| 164 | + private double round_as_r(double value) { |
| 165 | + value = roundHalfEven("0", value * RESULT_ROUND_R); |
| 166 | + value = value/RESULT_ROUND_R; |
| 167 | + value = value == 0.0 ? 0 : (value == 1.0 ? 1 : value); |
| 168 | + return value; |
| 169 | + } |
| 170 | + |
| 171 | + private void calculatePValue() { |
| 172 | + PvalueLess = pnhyper(x, false); |
| 173 | + PvalueGreater = pnhyper(x, true); |
| 174 | + |
| 175 | + double relErr = 1 + 1E-7; |
| 176 | + List<Double> d = dnhyper(1.0); |
| 177 | + double sum = 0.0; |
| 178 | + for (Double el : d) { |
| 179 | + if (el <= d.get(x - lo) * relErr) { |
| 180 | + sum += el; |
| 181 | + } |
| 182 | + } |
| 183 | + PvalueTwoSided = sum; |
| 184 | + } |
| 185 | + |
| 186 | + private double pnhyper(int q, boolean upper_tail) { |
| 187 | + if (m + n == 0) { |
| 188 | + return 1.0; |
| 189 | + } |
| 190 | + if (upper_tail) { |
| 191 | + HypergeometricDistribution dhyper = new HypergeometricDistribution(m + n, m, k); |
| 192 | + return dhyper.upperCumulativeProbability(q); |
| 193 | + } else { |
| 194 | + HypergeometricDistribution dhyper = new HypergeometricDistribution(m + n, m, k); |
| 195 | + return dhyper.cumulativeProbability(q); |
| 196 | + } |
| 197 | + } |
| 198 | +} |
| 199 | + |
| 200 | + |
0 commit comments