|
| 1 | +"""Fisher geometry of dietary composition: when correct math changes the answer. |
| 2 | +
|
| 3 | +Nutritional epidemiology studies diet as proportions of energy from |
| 4 | +macronutrients -- points on the probability simplex. But standard |
| 5 | +analyses treat these proportions as unconstrained Euclidean variables, |
| 6 | +creating spurious correlations (Aitchison's closure problem) and |
| 7 | +distorting apparent dietary associations. |
| 8 | +
|
| 9 | +The Fisher amplitude lift corrects the geometry without breaking at |
| 10 | +zeros -- critical because roughly a third of US adults are habitual |
| 11 | +non-drinkers, producing exact zeros in the alcohol component. |
| 12 | +Log-ratio corrections (Aitchison ILR) cannot handle these zeros without |
| 13 | +pseudocounts that distort the boundary. |
| 14 | +
|
| 15 | +This example generates synthetic dietary compositions resembling NHANES |
| 16 | +macronutrient profiles and demonstrates three results: |
| 17 | + 1. Closure creates large spurious correlations that Fisher invariants |
| 18 | + avoid. |
| 19 | + 2. Dietary concentration (Q_delta) is an independent structural signal |
| 20 | + invisible to raw proportions. |
| 21 | + 3. Fisher geometry preserves the full sample where log-ratio methods |
| 22 | + discard a third of it. |
| 23 | +
|
| 24 | +Demonstrates: forced_coordinates, q_delta, h3, fisher_distance, |
| 25 | + fisher_mean, fisher_pca, shannon_entropy, |
| 26 | + divergence_analysis, distributional_shift, |
| 27 | + pairwise_fisher_distances. |
| 28 | +""" |
| 29 | + |
| 30 | +import numpy as np |
| 31 | + |
| 32 | +import fisher_simplex as fs |
| 33 | + |
| 34 | +print("=== Fisher Geometry of Dietary Composition ===\n") |
| 35 | + |
| 36 | +rng = np.random.default_rng(42) |
| 37 | + |
| 38 | +# ── Generate synthetic dietary compositions ────────────────────────── |
| 39 | +# Four macronutrient energy components on Delta^3: |
| 40 | +# p = (protein, carbohydrate, fat, alcohol) |
| 41 | +# Typical US diet (NHANES 2017-2018): ~16% protein, ~47% carb, ~36% fat, |
| 42 | +# ~3% alcohol. About 31% of adults are habitual non-drinkers. |
| 43 | + |
| 44 | +N_SUBJECTS = 400 |
| 45 | + |
| 46 | +# Three dietary archetypes via asymmetric Dirichlet, calibrated to |
| 47 | +# NHANES reduced-rank-regression quintiles (Shan et al. JAMA 2019): |
| 48 | +# balanced: near population mean (~16% prot, ~47% carb, ~36% fat) |
| 49 | +# high_carb: upper carb quintile (~14% prot, ~59% carb, ~26% fat) |
| 50 | +# high_fat: upper fat quintile (~17% prot, ~38% carb, ~44% fat) |
| 51 | +archetypes = { |
| 52 | + "balanced": (4.0, 12.0, 9.0, 0.7), |
| 53 | + "high_carb": (3.5, 15.0, 6.5, 0.5), |
| 54 | + "high_fat": (4.0, 9.5, 11.0, 0.5), |
| 55 | +} |
| 56 | + |
| 57 | +n_per = N_SUBJECTS // 3 |
| 58 | +remainder = N_SUBJECTS - 3 * n_per |
| 59 | + |
| 60 | +compositions = [] |
| 61 | +labels = [] |
| 62 | +for i, (name, alpha) in enumerate(archetypes.items()): |
| 63 | + m = n_per + (remainder if i == 0 else 0) |
| 64 | + batch = rng.dirichlet(alpha, size=m) |
| 65 | + compositions.append(batch) |
| 66 | + labels.extend([name] * m) |
| 67 | + |
| 68 | +diets = np.vstack(compositions) # shape (N_SUBJECTS, 4) |
| 69 | +labels = np.array(labels) |
| 70 | + |
| 71 | +# Zero out alcohol for ~31% of subjects (habitual non-drinkers), |
| 72 | +# then renormalize. NHANES 2017-2018: ~31% lifetime + past-year |
| 73 | +# abstainers combined. |
| 74 | +non_drinker_mask = rng.random(N_SUBJECTS) < 0.31 |
| 75 | +diets[non_drinker_mask, 3] = 0.0 |
| 76 | +diets = diets / diets.sum(axis=1, keepdims=True) |
| 77 | + |
| 78 | +n_nondrinkers = non_drinker_mask.sum() |
| 79 | +print(f"Subjects: {N_SUBJECTS} (non-drinkers: {n_nondrinkers}, " |
| 80 | + f"{100 * n_nondrinkers / N_SUBJECTS:.0f}%)") |
| 81 | +print("Components: protein, carbohydrate, fat, alcohol (N=4)\n") |
| 82 | + |
| 83 | +# Print archetype means. |
| 84 | +print(f"{'Archetype':12s} {'%prot':>6s} {'%carb':>6s} {'%fat':>6s} {'%alc':>6s}") |
| 85 | +print("-" * 44) |
| 86 | +for name in archetypes: |
| 87 | + mask = labels == name |
| 88 | + m = diets[mask].mean(axis=0) |
| 89 | + print(f"{name:12s} {m[0]:6.1%} {m[1]:6.1%} {m[2]:6.1%} {m[3]:6.1%}") |
| 90 | + |
| 91 | +# ── Closure distortion: spurious Euclidean correlations ────────────── |
| 92 | +# Because proportions sum to 1, increasing %fat FORCES %carb downward. |
| 93 | +# This creates a strong negative correlation that is a mathematical |
| 94 | +# artifact of the constraint, not a biological signal. |
| 95 | + |
| 96 | +R_euclidean = np.corrcoef(diets.T) |
| 97 | + |
| 98 | +print("\n--- Euclidean correlation matrix (raw proportions) ---") |
| 99 | +col_names = ["prot", "carb", "fat ", "alc "] |
| 100 | +print(f"{'':6s} {'prot':>6s} {'carb':>6s} {'fat':>6s} {'alc':>6s}") |
| 101 | +for i, n in enumerate(col_names): |
| 102 | + row = " ".join(f"{R_euclidean[i, j]:+6.3f}" for j in range(4)) |
| 103 | + print(f"{n:6s} {row}") |
| 104 | + |
| 105 | +print("\nNote: fat-carb correlation is strongly negative (closure artifact).") |
| 106 | +print("This is not biology -- it is forced by the sum-to-one constraint.") |
| 107 | + |
| 108 | +# Fisher invariants live on the sphere and are not subject to closure. |
| 109 | +coords = fs.forced_coordinates(diets) # shape (N_SUBJECTS, 2) |
| 110 | +q_vals = coords[:, 0] |
| 111 | +h_vals = coords[:, 1] |
| 112 | + |
| 113 | +# Correlation between Q_delta and H_3 is intrinsic, not an artifact. |
| 114 | +r_qh = np.corrcoef(q_vals, h_vals)[0, 1] |
| 115 | +print(f"\nFisher forced-pair correlation (Q_delta, H_3): {r_qh:+.3f}") |
| 116 | +print("This correlation is geometric (intrinsic), not a closure artifact.") |
| 117 | + |
| 118 | +# ── Dietary concentration: Q_delta as structural signal ────────────── |
| 119 | +# Q_delta measures how dominated the diet is by one or two macros. |
| 120 | +# High Q_delta = nutritional imbalance; low Q_delta = macro balance. |
| 121 | + |
| 122 | +print("\n--- Dietary concentration by archetype ---") |
| 123 | +print(f"{'Archetype':12s} {'Q_delta':>18s} {'H_3':>18s} {'Shannon':>10s}") |
| 124 | +print(f"{'':12s} {'mean +/- std':>18s} {'mean +/- std':>18s} {'mean':>10s}") |
| 125 | +print("-" * 64) |
| 126 | +for name in archetypes: |
| 127 | + mask = labels == name |
| 128 | + q = q_vals[mask] |
| 129 | + h = h_vals[mask] |
| 130 | + ent = fs.shannon_entropy(diets[mask]) |
| 131 | + print(f"{name:12s} {q.mean():.4f} +/- {q.std():.4f}" |
| 132 | + f" {h.mean():+.2e} +/- {h.std():.2e}" |
| 133 | + f" {ent.mean():10.3f}") |
| 134 | + |
| 135 | +print("\nHigh-carb diets have clearly higher Q_delta (single-macro dominance).") |
| 136 | +print("High-fat diets have similar Q_delta to balanced -- shifting fat for") |
| 137 | +print("carb does not increase concentration. Q_delta captures structure") |
| 138 | +print("that raw %fat or %carb alone cannot express.") |
| 139 | + |
| 140 | +# ── Concentration predicts risk better than raw proportions ────────── |
| 141 | +# Synthetic metabolic risk that depends on compositional structure |
| 142 | +# (dietary concentration) rather than any single macronutrient. |
| 143 | + |
| 144 | +noise = rng.normal(0, 0.3, N_SUBJECTS) |
| 145 | +risk_score = 2.0 * q_vals + 5.0 * h_vals + noise |
| 146 | + |
| 147 | +# Compare: correlation of risk with raw %fat vs with Q_delta. |
| 148 | +fat = diets[:, 2] |
| 149 | +r_fat_risk = np.corrcoef(fat, risk_score)[0, 1] |
| 150 | +r_carb_risk = np.corrcoef(diets[:, 1], risk_score)[0, 1] |
| 151 | +r_q_risk = np.corrcoef(q_vals, risk_score)[0, 1] |
| 152 | +r_h_risk = np.corrcoef(h_vals, risk_score)[0, 1] |
| 153 | + |
| 154 | +print("\n--- Correlation with synthetic metabolic risk ---") |
| 155 | +print(f" %fat vs risk: {r_fat_risk:+.3f}") |
| 156 | +print(f" %carb vs risk: {r_carb_risk:+.3f}") |
| 157 | +print(f" Q_delta vs risk: {r_q_risk:+.3f}") |
| 158 | +print(f" H_3 vs risk: {r_h_risk:+.3f}") |
| 159 | + |
| 160 | + |
| 161 | +def _ols_r2(X, y): |
| 162 | + """R-squared from OLS regression of y on columns of X.""" |
| 163 | + X_aug = np.column_stack([np.ones(len(y)), X]) |
| 164 | + beta = np.linalg.lstsq(X_aug, y, rcond=None)[0] |
| 165 | + y_hat = X_aug @ beta |
| 166 | + ss_res = np.sum((y - y_hat) ** 2) |
| 167 | + ss_tot = np.sum((y - y.mean()) ** 2) |
| 168 | + return 1.0 - ss_res / ss_tot |
| 169 | + |
| 170 | + |
| 171 | +r2_raw = _ols_r2(diets[:, :3], risk_score) # %prot, %carb, %fat |
| 172 | +r2_fisher = _ols_r2(coords, risk_score) # Q_delta, H_3 |
| 173 | + |
| 174 | +print(f"\n R^2 (raw %prot, %carb, %fat -> risk): {r2_raw:.3f} (3 predictors)") |
| 175 | +print(f" R^2 (Q_delta, H_3 -> risk): {r2_fisher:.3f} (2 predictors)") |
| 176 | +print("\nFisher invariants explain more variance with fewer predictors.") |
| 177 | + |
| 178 | +# ── Boundary safety: alcohol zeros ─────────────────────────────────── |
| 179 | +# Log-ratio (Aitchison) methods require all components > 0. |
| 180 | +# Subjects with zero alcohol must be excluded or imputed. |
| 181 | +# Fisher geometry handles zeros naturally: sqrt(0) = 0. |
| 182 | + |
| 183 | +drinkers = diets[~non_drinker_mask] |
| 184 | +nondrinkers = diets[non_drinker_mask] |
| 185 | + |
| 186 | +print("\n--- Boundary safety: alcohol zeros ---") |
| 187 | +print(f" Drinkers: {len(drinkers):4d} subjects (Aitchison-eligible)") |
| 188 | +print(f" Non-drinkers: {len(nondrinkers):4d} subjects (excluded by Aitchison)") |
| 189 | +print(f" Full sample: {N_SUBJECTS:4d} subjects (all usable by Fisher)") |
| 190 | + |
| 191 | +# Fisher distances are well-defined for all subjects, including |
| 192 | +# those on the simplex boundary (p_alcohol = 0). |
| 193 | +mean_all = fs.fisher_mean(diets) |
| 194 | +mean_drinkers = fs.fisher_mean(drinkers) |
| 195 | + |
| 196 | +d_means = fs.fisher_distance(mean_all, mean_drinkers) |
| 197 | +print(f"\n Fisher distance, full-sample mean vs drinkers-only: {d_means:.4f}") |
| 198 | + |
| 199 | +# Distributional shift between drinkers and non-drinkers. |
| 200 | +shift = fs.distributional_shift(drinkers, nondrinkers) |
| 201 | +print(" Cloud shift (drinkers vs non-drinkers):") |
| 202 | +print(f" Mean distance: {shift['mean_distance']:.4f}") |
| 203 | +print(f" Cloud distance: {shift['cloud_distance']:.4f}") |
| 204 | +print(f" Ref dispersion: {shift['ref_dispersion']:.4f}") |
| 205 | +print(f" Test dispersion: {shift['test_dispersion']:.4f}") |
| 206 | + |
| 207 | +# Log-ratio on non-drinkers produces -inf. |
| 208 | +with np.errstate(divide="ignore"): |
| 209 | + log_nondrinkers = np.log(nondrinkers) |
| 210 | +n_inf = np.isinf(log_nondrinkers).any(axis=1).sum() |
| 211 | +print(f"\n Log-ratio on non-drinkers: {n_inf}/{len(nondrinkers)} rows contain -inf") |
| 212 | +print(" Fisher lift on non-drinkers: all finite (sqrt(0) = 0)") |
| 213 | + |
| 214 | +# ── Overlap divergence: Phi vs Psi at the boundary ────────────────── |
| 215 | +# Psi = N^N * prod(p_i) collapses to zero whenever any component is |
| 216 | +# zero, making the Phi-Psi gap diagnostic of boundary proximity. |
| 217 | + |
| 218 | +div = fs.divergence_analysis(diets) |
| 219 | +print("\n--- Overlap divergence (Phi vs Psi) ---") |
| 220 | +print(f" Mean |Phi - Psi|: {div['mean_divergence']:.4f}") |
| 221 | +print(f" Max |Phi - Psi|: {div['max_divergence']:.4f}") |
| 222 | +print(f" Fraction consequential: {div['fraction_consequential']:.1%}") |
| 223 | +print(f" Recommendation: {div['recommendation']}") |
| 224 | + |
| 225 | +div_drinkers = fs.divergence_analysis(drinkers) |
| 226 | +div_nondrinkers = fs.divergence_analysis(nondrinkers) |
| 227 | +print(f"\n Drinkers mean divergence: {div_drinkers['mean_divergence']:.4f}") |
| 228 | +print(f" Non-drinkers mean divergence: {div_nondrinkers['mean_divergence']:.4f}") |
| 229 | +print(" Non-drinkers have maximal divergence (Psi = 0 at boundary).") |
| 230 | + |
| 231 | +# ── Dietary pattern discovery: Fisher PCA ──────────────────────────── |
| 232 | +# Tangent PCA at the Fisher mean reveals the principal axes of dietary |
| 233 | +# variation on the sphere -- geometrically meaningful directions. |
| 234 | + |
| 235 | +pca = fs.fisher_pca(diets, n_components=3) |
| 236 | +print("\n--- Fisher tangent PCA ---") |
| 237 | +evr = pca["explained_variance_ratio"] |
| 238 | +print(f" Variance explained: {evr[0]:.1%}, {evr[1]:.1%}, {evr[2]:.1%}") |
| 239 | +print(f" Cumulative: {sum(evr[:2]):.1%} in first two components") |
| 240 | + |
| 241 | +# Compare Fisher vs Euclidean distances between archetype means. |
| 242 | +arch_names = list(archetypes.keys()) |
| 243 | +arch_means = np.array([ |
| 244 | + fs.fisher_mean(diets[labels == name]) for name in arch_names |
| 245 | +]) |
| 246 | + |
| 247 | +D_fisher = fs.pairwise_fisher_distances(arch_means) |
| 248 | +D_euclid = np.zeros((3, 3)) |
| 249 | +for i in range(3): |
| 250 | + for j in range(3): |
| 251 | + D_euclid[i, j] = np.linalg.norm(arch_means[i] - arch_means[j]) |
| 252 | + |
| 253 | +print("\n--- Inter-archetype distances ---") |
| 254 | +print(f"{'Pair':30s} {'Fisher':>8s} {'Euclid':>8s} {'Ratio':>7s}") |
| 255 | +print("-" * 57) |
| 256 | +for i in range(3): |
| 257 | + for j in range(i + 1, 3): |
| 258 | + ratio = D_fisher[i, j] / max(D_euclid[i, j], 1e-12) |
| 259 | + print(f"{arch_names[i]:>14s} - {arch_names[j]:<14s}" |
| 260 | + f" {D_fisher[i, j]:8.4f} {D_euclid[i, j]:8.4f} {ratio:7.2f}") |
| 261 | + |
| 262 | +print("\nFisher distance amplifies separation between dietary archetypes,") |
| 263 | +print("especially those differing near the simplex boundary.") |
| 264 | + |
| 265 | +# ── Fat subtriangle (Delta^2) ──────────────────────────────────────── |
| 266 | +# Decompose total fat into (saturated, monounsaturated, polyunsaturated). |
| 267 | +# NHANES baseline among the three: ~36% SFA, ~38% MUFA, ~26% PUFA |
| 268 | +# (NCI Cancer Trends Progress Report, 2021-2023). |
| 269 | +# On Delta^2, Q_delta and H_3 capture the full invariant structure |
| 270 | +# through degree 6. |
| 271 | + |
| 272 | +fat_alpha = { |
| 273 | + "balanced": (4.7, 4.9, 3.4), |
| 274 | + "high_sat": (12, 4, 2), |
| 275 | + "high_poly": (3, 4, 10), |
| 276 | +} |
| 277 | +fat_comps = [] |
| 278 | +fat_labels = [] |
| 279 | +for name, alpha in fat_alpha.items(): |
| 280 | + batch = rng.dirichlet(alpha, size=100) |
| 281 | + fat_comps.append(batch) |
| 282 | + fat_labels.extend([name] * 100) |
| 283 | +fat_diets = np.vstack(fat_comps) |
| 284 | +fat_labels = np.array(fat_labels) |
| 285 | + |
| 286 | +print("\n--- Fat subtriangle (sat, mono, poly) on Delta^2 ---") |
| 287 | +q_fat = fs.q_delta(fat_diets) |
| 288 | +h_fat = fs.h3(fat_diets) |
| 289 | +print(f"{'Fat profile':12s} {'Q_delta':>12s} {'H_3':>14s}") |
| 290 | +print("-" * 42) |
| 291 | +for name in fat_alpha: |
| 292 | + mask = np.array(fat_labels) == name |
| 293 | + print(f"{name:12s} {q_fat[mask].mean():12.4f} {h_fat[mask].mean():+14.2e}") |
| 294 | + |
| 295 | +print("\nQ_delta separates concentrated fat profiles (high-saturated)") |
| 296 | +print("from balanced ones; H_3 captures asymmetry in the imbalance.") |
| 297 | + |
| 298 | +print("\n--- Dietary composition example complete ---") |
0 commit comments