|
| 1 | +""" |
| 2 | +================================================= |
| 3 | +Use parametric statistics for group comparison |
| 4 | +================================================= |
| 5 | +
|
| 6 | +As a contrast to the approach presented in other examples, we also present a |
| 7 | +more "standard" statistical approach to analyze tract profile data. Here, we |
| 8 | +will conduct a point-by-point group comparison using a linear model fit using |
| 9 | +the standard ordinary least squares (OLS) method. |
| 10 | +
|
| 11 | +This example first fetches the ALS classification dataset from Sarica et al |
| 12 | +[1]_. This dataset contains tractometry features from 24 patients with ALS and |
| 13 | +24 demographically matched control subjects. It then uses the statsmodels |
| 14 | +library to compare between the tract profiles of the two groups in one |
| 15 | +tract (the left corticospinal tract) and in one feature (FA). |
| 16 | +
|
| 17 | +.. [1] Alessia Sarica, et al. "The Corticospinal Tract Profile in |
| 18 | + AmyotrophicLateral Sclerosis" Human Brain Mapping, vol. 38, pp. 727-739, 2017 |
| 19 | + DOI: 10.1002/hbm.23412 |
| 20 | +
|
| 21 | +""" |
| 22 | +import matplotlib.pyplot as plt |
| 23 | +import numpy as np |
| 24 | +import pandas as pd |
| 25 | + |
| 26 | +from afqinsight import AFQDataset |
| 27 | +from statsmodels.api import OLS |
| 28 | +from statsmodels.stats.anova import anova_lm |
| 29 | +from statsmodels.stats.multitest import multipletests |
| 30 | + |
| 31 | +from sklearn.impute import SimpleImputer |
| 32 | + |
| 33 | +############################################################################# |
| 34 | +# Fetch data from Sarica et al. |
| 35 | +# ----------------------------- |
| 36 | +# As a shortcut, we have incorporated a few studies into the software. In these |
| 37 | +# cases, a :class:`AFQDataset` class instance can be initialized using the |
| 38 | +# :func:`AFQDataset.from_study` static method. This expects the name of one of |
| 39 | +# the studies that are supported (see the method documentation for the list of |
| 40 | +# these studies). By passing `"sarica"`, we request that the software download |
| 41 | +# the data from this study and initialize an object for us from this data. |
| 42 | + |
| 43 | +afqdata = AFQDataset.from_study("sarica") |
| 44 | + |
| 45 | +# Examine the data |
| 46 | +# ---------------- |
| 47 | +# ``afqdata`` is an ``AFQDataset`` object, with properties corresponding to the |
| 48 | +# tractometry features and phenotypic targets. In this case, y includes only the |
| 49 | +# group to which the subjects belong, encoded as 0 (patients) or 1 (controls). |
| 50 | + |
| 51 | +X = afqdata.X |
| 52 | +y = afqdata.y |
| 53 | +groups = afqdata.groups |
| 54 | +feature_names = afqdata.feature_names |
| 55 | +group_names = afqdata.group_names |
| 56 | +subjects = afqdata.subjects |
| 57 | + |
| 58 | +X = SimpleImputer(strategy="median").fit_transform(X) |
| 59 | + |
| 60 | +# Filtering the data |
| 61 | +# ---------------- |
| 62 | +# We start by filtering the data down to only the FA estimates in the left |
| 63 | +# corticospinal tract. We can do that by creating a Pandas dataframe and then |
| 64 | +# using the column labels that `afqdataset` has generated from the data. |
| 65 | + |
| 66 | +data = pd.DataFrame(columns=afqdata.feature_names, data=X) |
| 67 | +lcst_fa = data.filter(like="Left Corticospinal").filter(like="fa") |
| 68 | + |
| 69 | +pvals = np.zeros(lcst_fa.shape[-1]) |
| 70 | +coefs = np.zeros(lcst_fa.shape[-1]) |
| 71 | +cis = np.zeros((lcst_fa.shape[-1], 2)) |
| 72 | + |
| 73 | +# Fit models |
| 74 | +# ---------------- |
| 75 | +# Next, we fit a model for each node of the CST for the FA as a function of |
| 76 | +# group. The initializer `OLS.from_formula` takes |
| 77 | +# `R-style formulas <https://www.statsmodels.org/dev/example_formulas.html>`_ |
| 78 | +# as its model specification. Here, we are using the `"group"` variable as a |
| 79 | +# categorical variable in the model. Much more complex linear models (including |
| 80 | +# linear mixed effects model) are possible for datasets with more complex |
| 81 | +# phenotypical data. |
| 82 | + |
| 83 | +for ii, column in enumerate(lcst_fa.columns): |
| 84 | + feature, node, tract = column |
| 85 | + this = pd.DataFrame({"group": y, feature: lcst_fa[column]}) |
| 86 | + model = OLS.from_formula(f"{feature} ~ C(group)", this) |
| 87 | + fit = model.fit() |
| 88 | + coefs[ii] = fit.params.loc["C(group)[T.1]"] |
| 89 | + cis[ii] = fit.conf_int(alpha=0.05).loc["C(group)[T.1]"].values |
| 90 | + pvals[ii] = anova_lm(fit, typ=2)["PR(>F)"][0] |
| 91 | + |
| 92 | +# Correct for multiple comparison |
| 93 | +# -------------------------------- |
| 94 | +# Because we conducted 100 comparisons, we need to correct the p-values that |
| 95 | +# we obtained for the potential for a false discovery. There are multiple |
| 96 | +# ways to conduct multuple comparison correction, and we will not get into |
| 97 | +# the considerations in selecting a method here. For the example, we chose the |
| 98 | +# Benjamini/Hochberg FDR controlling method. This returns a boolean array for |
| 99 | +# the p-values that are rejected at a specified alpha level (after correction), |
| 100 | +# as well as an array of the corrected p-values. |
| 101 | + |
| 102 | +reject, pval_corrected, _, _ = multipletests(pvals, alpha=0.05, method="fdr_bh") |
| 103 | +reject_idx = np.where(reject) |
| 104 | + |
| 105 | +# Visualize |
| 106 | +# ---------- |
| 107 | +# Using Matplotlib, we can visualize the results. The top figure shows the tract |
| 108 | +# profiles of the two groups, with stars indicating the nodes at which the null |
| 109 | +# hypothesis is rejected. The bottom figure shows the model coefficients |
| 110 | +# together with their estimated 95% confidence interval. |
| 111 | + |
| 112 | +fig, ax = plt.subplots(2, 1) |
| 113 | +ax[0].plot(np.mean(lcst_fa.iloc[afqdata.y == 0], 0).values) |
| 114 | +ax[0].plot(np.mean(lcst_fa.iloc[afqdata.y == 1], 0).values) |
| 115 | +ax[0].plot(reject_idx, np.zeros(len(reject_idx)), "k*") |
| 116 | + |
| 117 | +ax[1].plot(coefs) |
| 118 | +ax[1].fill_between(range(100), cis[:, 0], cis[:, 1], alpha=0.5) |
| 119 | +ax[1].plot(range(100), np.zeros(100), "k--") |
0 commit comments