|
| 1 | +#!/usr/bin/env python3 |
| 2 | +"""Table 1 — yearly climate × cohort × RRT × milk-yield summary. |
| 3 | +
|
| 4 | +For each summer (1 June – 30 September) of 2021–2024 plus an "Overall" |
| 5 | +row, compute mean ± SD of: |
| 6 | +
|
| 7 | +* Barn temperature (°C) — Neubau sensors only (barn_id ∈ {1, 2}) |
| 8 | +* Relative humidity (%) — Neubau sensors only |
| 9 | +* Temperature–Humidity Index — Neubau sensors only |
| 10 | +* Number of cows included (Neubau-pen cohort, ≥ 30 d in groups 1005/1006 |
| 11 | + during the summer window — same definition as the manuscript cohort) |
| 12 | +* Reticulorumen temperature (RRT, °C) — `temp_without_drink_cycles` from |
| 13 | + smaxtec_derived, restricted to the Neubau cohort animals × that summer, |
| 14 | + filtered to 30–43 °C, milking-hours excluded (04–07, 16–19) |
| 15 | +* Daily milk yield (kg/d) — Neubau-cohort animals × that summer |
| 16 | +
|
| 17 | +Writes Markdown to ``results/broken_stick/01_extract/table1_yearly_climate.md``. |
| 18 | +""" |
| 19 | + |
| 20 | +from __future__ import annotations |
| 21 | + |
| 22 | +import logging |
| 23 | +import sqlite3 |
| 24 | +import sys |
| 25 | +from pathlib import Path |
| 26 | + |
| 27 | +import numpy as np |
| 28 | +import pandas as pd |
| 29 | + |
| 30 | +logging.basicConfig(level=logging.INFO, |
| 31 | + format="%(asctime)s %(levelname)s | %(message)s") |
| 32 | +log = logging.getLogger("digimuh.table1") |
| 33 | + |
| 34 | + |
| 35 | +DB_PATH = Path("/media/geuba03p/GEURTEN01/cow.db") |
| 36 | +DATA_DIR = Path("results/broken_stick") |
| 37 | +OUT_PATH = DATA_DIR / "01_extract" / "table1_yearly_climate.md" |
| 38 | +NEUBAU_GROUPS = (1005, 1006) |
| 39 | +NEUBAU_BARNS = (1, 2) |
| 40 | +SUMMER_START = "06-01" |
| 41 | +SUMMER_END = "09-30" |
| 42 | +YEARS = (2021, 2022, 2023, 2024) |
| 43 | + |
| 44 | + |
| 45 | +# ── Cohort builder (re-uses the manuscript cohort) ─────────────── |
| 46 | + |
| 47 | +def neubau_pen_cohort(con: sqlite3.Connection, |
| 48 | + min_days: int = 30) -> pd.DataFrame: |
| 49 | + """Per-(animal, year) cohort: ≥ ``min_days`` d in groups 1005/1006 |
| 50 | + during 1 June – 30 September of each study year.""" |
| 51 | + qmarks = ",".join("?" * len(NEUBAU_GROUPS)) |
| 52 | + alloc = pd.read_sql( |
| 53 | + f"""SELECT animal_id, "group" AS grp, datetime_enter, datetime_exit |
| 54 | + FROM allocations |
| 55 | + WHERE "group" IN ({qmarks}) |
| 56 | + AND datetime_enter <= '2024-09-30' |
| 57 | + AND datetime_exit >= '2021-04-01'""", |
| 58 | + con, params=list(NEUBAU_GROUPS), |
| 59 | + ) |
| 60 | + alloc["datetime_enter"] = pd.to_datetime(alloc["datetime_enter"]) |
| 61 | + alloc["datetime_exit"] = pd.to_datetime(alloc["datetime_exit"]) |
| 62 | + rows: list[dict] = [] |
| 63 | + for year in YEARS: |
| 64 | + s_start = pd.Timestamp(f"{year}-{SUMMER_START}") |
| 65 | + s_end = pd.Timestamp(f"{year}-{SUMMER_END}") |
| 66 | + for aid, grp in alloc.groupby("animal_id"): |
| 67 | + covers = [] |
| 68 | + for _, r in grp.iterrows(): |
| 69 | + lo = max(r["datetime_enter"], s_start) |
| 70 | + hi = min(r["datetime_exit"], s_end) |
| 71 | + if hi > lo: |
| 72 | + covers.append((lo, hi)) |
| 73 | + if not covers: |
| 74 | + continue |
| 75 | + days = sum((hi - lo).days for lo, hi in covers) |
| 76 | + if days < min_days: |
| 77 | + continue |
| 78 | + enter = min(lo for lo, _ in covers) |
| 79 | + exit_ = max(hi for _, hi in covers) |
| 80 | + rows.append({ |
| 81 | + "animal_id": int(aid), "year": int(year), |
| 82 | + "datetime_enter": enter, "datetime_exit": exit_, |
| 83 | + }) |
| 84 | + return pd.DataFrame(rows) |
| 85 | + |
| 86 | + |
| 87 | +# ── Climate per summer (Neubau sensors) ────────────────────────── |
| 88 | + |
| 89 | +def climate_summary(con: sqlite3.Connection) -> dict[int, dict]: |
| 90 | + """Mean ± SD barn temp, RH, THI per summer. Reads at native |
| 91 | + sensor resolution from smaxtec_barns, filtered to Neubau.""" |
| 92 | + qmarks = ",".join("?" * len(NEUBAU_BARNS)) |
| 93 | + out: dict[int, dict] = {} |
| 94 | + all_T, all_H, all_TH = [], [], [] |
| 95 | + for year in YEARS: |
| 96 | + s_start = f"{year}-{SUMMER_START}" |
| 97 | + s_end = f"{year}-{SUMMER_END} 23:59:59" |
| 98 | + df = pd.read_sql( |
| 99 | + f"""SELECT temp, hum, temp_hum_index |
| 100 | + FROM smaxtec_barns |
| 101 | + WHERE barn_id IN ({qmarks}) |
| 102 | + AND "timestamp" >= ? AND "timestamp" <= ? |
| 103 | + AND temp IS NOT NULL |
| 104 | + AND hum IS NOT NULL |
| 105 | + AND temp_hum_index IS NOT NULL""", |
| 106 | + con, params=[*NEUBAU_BARNS, s_start, s_end], |
| 107 | + ) |
| 108 | + out[year] = { |
| 109 | + "barn_T_mean": df["temp"].mean(), |
| 110 | + "barn_T_sd": df["temp"].std(), |
| 111 | + "barn_H_mean": df["hum"].mean(), |
| 112 | + "barn_H_sd": df["hum"].std(), |
| 113 | + "barn_THI_mean": df["temp_hum_index"].mean(), |
| 114 | + "barn_THI_sd": df["temp_hum_index"].std(), |
| 115 | + "n_climate": len(df), |
| 116 | + } |
| 117 | + all_T.append(df["temp"]); all_H.append(df["hum"]); all_TH.append(df["temp_hum_index"]) |
| 118 | + overall = { |
| 119 | + "barn_T_mean": pd.concat(all_T).mean(), |
| 120 | + "barn_T_sd": pd.concat(all_T).std(), |
| 121 | + "barn_H_mean": pd.concat(all_H).mean(), |
| 122 | + "barn_H_sd": pd.concat(all_H).std(), |
| 123 | + "barn_THI_mean": pd.concat(all_TH).mean(), |
| 124 | + "barn_THI_sd": pd.concat(all_TH).std(), |
| 125 | + "n_climate": sum(len(s) for s in all_T), |
| 126 | + } |
| 127 | + out["overall"] = overall |
| 128 | + return out |
| 129 | + |
| 130 | + |
| 131 | +# ── RRT per summer (Neubau cohort × milking-hours-excluded) ────── |
| 132 | + |
| 133 | +def rrt_summary(con: sqlite3.Connection, |
| 134 | + cohort: pd.DataFrame) -> dict[int, dict]: |
| 135 | + """Mean ± SD reticulorumen temperature per summer, restricted to |
| 136 | + the Neubau cohort animals × each summer window. Filtered to |
| 137 | + 30–43 °C; milking hours excluded (04–07, 16–19) to match the |
| 138 | + manuscript pipeline (§3.1 of analysis_00_methods.md). Reads in |
| 139 | + chunks to keep memory bounded.""" |
| 140 | + out: dict[int, dict] = {} |
| 141 | + grand_n = 0 |
| 142 | + grand_sum = 0.0 |
| 143 | + grand_sumsq = 0.0 |
| 144 | + for year in YEARS: |
| 145 | + ids = cohort.loc[cohort["year"] == year, "animal_id"].astype(int).unique() |
| 146 | + if len(ids) == 0: |
| 147 | + out[year] = {"rrt_mean": np.nan, "rrt_sd": np.nan, "n_rrt": 0} |
| 148 | + continue |
| 149 | + s_start = f"{year}-{SUMMER_START}" |
| 150 | + s_end = f"{year}-{SUMMER_END} 23:59:59" |
| 151 | + qmarks = ",".join("?" * len(ids)) |
| 152 | + log.info(f" RRT query for {year} ({len(ids)} animals) ...") |
| 153 | + df = pd.read_sql( |
| 154 | + f"""SELECT "timestamp", |
| 155 | + CAST(temp_without_drink_cycles AS REAL) AS rrt |
| 156 | + FROM smaxtec_derived |
| 157 | + WHERE animal_id IN ({qmarks}) |
| 158 | + AND "timestamp" >= ? AND "timestamp" <= ? |
| 159 | + AND temp_without_drink_cycles IS NOT NULL |
| 160 | + AND CAST(temp_without_drink_cycles AS REAL) |
| 161 | + BETWEEN 30 AND 43""", |
| 162 | + con, params=[*[int(x) for x in ids], s_start, s_end], |
| 163 | + ) |
| 164 | + # Milking-hour exclusion (UTC hours 4-7 and 16-19, matching |
| 165 | + # extract_rumen_barn). |
| 166 | + ts = pd.to_datetime(df["timestamp"], utc=True) |
| 167 | + hr = ts.dt.hour |
| 168 | + df = df[~hr.between(4, 7) & ~hr.between(16, 19)] |
| 169 | + rrt = df["rrt"].dropna().to_numpy() |
| 170 | + out[year] = { |
| 171 | + "rrt_mean": float(np.mean(rrt)) if rrt.size else np.nan, |
| 172 | + "rrt_sd": float(np.std(rrt, ddof=1)) if rrt.size > 1 else np.nan, |
| 173 | + "n_rrt": int(rrt.size), |
| 174 | + } |
| 175 | + # Online accumulator for overall mean/sd (avoid concatenating |
| 176 | + # the year-arrays of millions of rows). |
| 177 | + grand_n += rrt.size |
| 178 | + grand_sum += float(np.sum(rrt)) |
| 179 | + grand_sumsq += float(np.sum(rrt ** 2)) |
| 180 | + if grand_n > 1: |
| 181 | + mean_o = grand_sum / grand_n |
| 182 | + var_o = (grand_sumsq - grand_n * mean_o ** 2) / (grand_n - 1) |
| 183 | + out["overall"] = {"rrt_mean": mean_o, |
| 184 | + "rrt_sd": float(np.sqrt(max(var_o, 0.0))), |
| 185 | + "n_rrt": grand_n} |
| 186 | + else: |
| 187 | + out["overall"] = {"rrt_mean": np.nan, "rrt_sd": np.nan, "n_rrt": 0} |
| 188 | + return out |
| 189 | + |
| 190 | + |
| 191 | +# ── Daily milk yield per summer (Neubau cohort) ────────────────── |
| 192 | + |
| 193 | +def yield_summary(con: sqlite3.Connection, |
| 194 | + cohort: pd.DataFrame) -> dict[int, dict]: |
| 195 | + """Mean ± SD of per-cow-day daily milk yield per summer. One |
| 196 | + record per (animal, date) — daily totals from herdeplus.""" |
| 197 | + out: dict[int, dict] = {} |
| 198 | + all_y: list[pd.Series] = [] |
| 199 | + for year in YEARS: |
| 200 | + ids = cohort.loc[cohort["year"] == year, "animal_id"].astype(int).unique() |
| 201 | + if len(ids) == 0: |
| 202 | + out[year] = {"y_mean": np.nan, "y_sd": np.nan, "n_y": 0} |
| 203 | + continue |
| 204 | + s_start = f"{year}-{SUMMER_START}" |
| 205 | + s_end = f"{year}-{SUMMER_END} 23:59:59" |
| 206 | + qmarks = ",".join("?" * len(ids)) |
| 207 | + df = pd.read_sql( |
| 208 | + f"""SELECT animal_id, |
| 209 | + DATE("timestamp") AS date, |
| 210 | + SUM(herdeplus_milked_mkg) AS daily_yield_kg |
| 211 | + FROM herdeplus |
| 212 | + WHERE animal_id IN ({qmarks}) |
| 213 | + AND "timestamp" >= ? AND "timestamp" <= ? |
| 214 | + AND herdeplus_milked_mkg IS NOT NULL |
| 215 | + AND herdeplus_milked_mkg > 0 |
| 216 | + GROUP BY animal_id, DATE("timestamp")""", |
| 217 | + con, params=[*[int(x) for x in ids], s_start, s_end], |
| 218 | + ) |
| 219 | + y = df["daily_yield_kg"].dropna() |
| 220 | + out[year] = { |
| 221 | + "y_mean": float(y.mean()) if not y.empty else np.nan, |
| 222 | + "y_sd": float(y.std(ddof=1)) if len(y) > 1 else np.nan, |
| 223 | + "n_y": int(len(y)), |
| 224 | + } |
| 225 | + all_y.append(y) |
| 226 | + big = pd.concat(all_y) if all_y else pd.Series(dtype=float) |
| 227 | + out["overall"] = { |
| 228 | + "y_mean": float(big.mean()) if not big.empty else np.nan, |
| 229 | + "y_sd": float(big.std(ddof=1)) if len(big) > 1 else np.nan, |
| 230 | + "n_y": int(len(big)), |
| 231 | + } |
| 232 | + return out |
| 233 | + |
| 234 | + |
| 235 | +# ── Markdown writer ────────────────────────────────────────────── |
| 236 | + |
| 237 | +def fmt(mean, sd, decimals=1): |
| 238 | + if not np.isfinite(mean) or not np.isfinite(sd): |
| 239 | + return "—" |
| 240 | + return f"{mean:.{decimals}f} ± {sd:.{decimals}f}" |
| 241 | + |
| 242 | + |
| 243 | +def render_markdown(climate, rrt, yield_, cohort) -> str: |
| 244 | + n_cows = {y: int(cohort.loc[cohort.year == y, "animal_id"].nunique()) |
| 245 | + for y in YEARS} |
| 246 | + n_cows["overall"] = int(cohort["animal_id"].nunique()) |
| 247 | + |
| 248 | + rows = [] |
| 249 | + for key in [*YEARS, "overall"]: |
| 250 | + c = climate[key] |
| 251 | + r = rrt[key] |
| 252 | + ye = yield_[key] |
| 253 | + label = "Overall" if key == "overall" else str(key) |
| 254 | + rows.append({ |
| 255 | + "Year": label, |
| 256 | + "Barn T (°C)": fmt(c["barn_T_mean"], c["barn_T_sd"]), |
| 257 | + "RH (%)": fmt(c["barn_H_mean"], c["barn_H_sd"]), |
| 258 | + "THI": fmt(c["barn_THI_mean"], c["barn_THI_sd"]), |
| 259 | + "n cows": str(n_cows[key]), |
| 260 | + "RRT (°C)": fmt(r["rrt_mean"], r["rrt_sd"], 2), |
| 261 | + "Daily yield (kg)": fmt(ye["y_mean"], ye["y_sd"]), |
| 262 | + }) |
| 263 | + df = pd.DataFrame(rows) |
| 264 | + |
| 265 | + md = [] |
| 266 | + md.append("# Table 1 — Yearly climate, cohort size, RRT, and milk yield") |
| 267 | + md.append("") |
| 268 | + md.append("Summer period 1 June – 30 September each year. Cohort = Neubau pen cohort") |
| 269 | + md.append("(animals with ≥ 30 days in groups 1005 or 1006 during the summer window).") |
| 270 | + md.append("Climate from smaXtec barn sensors restricted to the Neubau barns") |
| 271 | + md.append("(`barn_id ∈ {1, 2}`). RRT from smaxtec_derived `temp_without_drink_cycles`") |
| 272 | + md.append("filtered to 30–43 °C with milking hours (04–07, 16–19 UTC) excluded.") |
| 273 | + md.append("Daily milk yield is the per-(animal, date) sum of `herdeplus_milked_mkg`.") |
| 274 | + md.append("Mean ± SD shown.") |
| 275 | + md.append("") |
| 276 | + md.append("| Year | Barn temperature (°C) | Relative air humidity (%) | " |
| 277 | + "Temperature-humidity index | Number of cows included | " |
| 278 | + "RRT (°C) | Daily milk yield (kg) |") |
| 279 | + md.append("|---|---|---|---|---:|---|---|") |
| 280 | + for _, r in df.iterrows(): |
| 281 | + md.append(f"| {r['Year']} | {r['Barn T (°C)']} | {r['RH (%)']} | " |
| 282 | + f"{r['THI']} | {r['n cows']} | {r['RRT (°C)']} | " |
| 283 | + f"{r['Daily yield (kg)']} |") |
| 284 | + md.append("") |
| 285 | + clim_n = ", ".join(f"{y}: {climate[y]['n_climate']:,}" for y in YEARS) |
| 286 | + rrt_n = ", ".join(f"{y}: {rrt[y]['n_rrt']:,}" for y in YEARS) |
| 287 | + yld_n = ", ".join(f"{y}: {yield_[y]['n_y']:,}" for y in YEARS) |
| 288 | + md.append( |
| 289 | + f"*Underlying counts: climate readings n = {clim_n} " |
| 290 | + f"(overall {climate['overall']['n_climate']:,}); " |
| 291 | + f"RRT readings n = {rrt_n} " |
| 292 | + f"(overall {rrt['overall']['n_rrt']:,}); " |
| 293 | + f"daily-yield records n = {yld_n} " |
| 294 | + f"(overall {yield_['overall']['n_y']:,}).*") |
| 295 | + md.append("") |
| 296 | + return "\n".join(md) |
| 297 | + |
| 298 | + |
| 299 | +def main() -> None: |
| 300 | + if not DB_PATH.exists(): |
| 301 | + log.error("Database not found at %s", DB_PATH) |
| 302 | + sys.exit(1) |
| 303 | + con = sqlite3.connect(DB_PATH) |
| 304 | + |
| 305 | + log.info("Building Neubau cohort ...") |
| 306 | + cohort = neubau_pen_cohort(con) |
| 307 | + log.info(" cohort: %d cow-years across %d animals", |
| 308 | + len(cohort), cohort.animal_id.nunique()) |
| 309 | + |
| 310 | + log.info("Aggregating barn climate ...") |
| 311 | + clim = climate_summary(con) |
| 312 | + |
| 313 | + log.info("Aggregating reticulorumen temperature ...") |
| 314 | + rrt = rrt_summary(con, cohort) |
| 315 | + |
| 316 | + log.info("Aggregating daily milk yield ...") |
| 317 | + ye = yield_summary(con, cohort) |
| 318 | + |
| 319 | + md = render_markdown(clim, rrt, ye, cohort) |
| 320 | + OUT_PATH.parent.mkdir(parents=True, exist_ok=True) |
| 321 | + OUT_PATH.write_text(md) |
| 322 | + log.info("Wrote %s", OUT_PATH) |
| 323 | + print() |
| 324 | + print(md) |
| 325 | + |
| 326 | + |
| 327 | +if __name__ == "__main__": |
| 328 | + main() |
0 commit comments