Skip to content

Commit 9fbdbe0

Browse files
bgeurtenclaude
andcommitted
feat(scripts): build Table 1 (yearly climate, cohort, RRT, yield)
Queries cow.db directly for the Neubau pen cohort and writes a Markdown table to results/broken_stick/01_extract/table1_yearly_climate.md with mean ± SD of barn temperature, relative humidity, THI, RRT, daily milk yield, and number of cows per summer (1 Jun – 30 Sep) plus an Overall row. Climate restricted to Neubau barns (barn_id ∈ {1,2}); RRT filtered to 30–43 °C with milking hours excluded; yield is per-(animal, date) sum of herdeplus_milked_mkg. Reproducible with 'python scripts/build_table1_yearly_climate.py' (~ 4 min, RRT query is the slow step at 8.6 M readings). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent cd191cc commit 9fbdbe0

1 file changed

Lines changed: 328 additions & 0 deletions

File tree

Lines changed: 328 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,328 @@
1+
#!/usr/bin/env python3
2+
"""Table 1 — yearly climate × cohort × RRT × milk-yield summary.
3+
4+
For each summer (1 June – 30 September) of 2021–2024 plus an "Overall"
5+
row, compute mean ± SD of:
6+
7+
* Barn temperature (°C) — Neubau sensors only (barn_id ∈ {1, 2})
8+
* Relative humidity (%) — Neubau sensors only
9+
* Temperature–Humidity Index — Neubau sensors only
10+
* Number of cows included (Neubau-pen cohort, ≥ 30 d in groups 1005/1006
11+
during the summer window — same definition as the manuscript cohort)
12+
* Reticulorumen temperature (RRT, °C) — `temp_without_drink_cycles` from
13+
smaxtec_derived, restricted to the Neubau cohort animals × that summer,
14+
filtered to 30–43 °C, milking-hours excluded (04–07, 16–19)
15+
* Daily milk yield (kg/d) — Neubau-cohort animals × that summer
16+
17+
Writes Markdown to ``results/broken_stick/01_extract/table1_yearly_climate.md``.
18+
"""
19+
20+
from __future__ import annotations
21+
22+
import logging
23+
import sqlite3
24+
import sys
25+
from pathlib import Path
26+
27+
import numpy as np
28+
import pandas as pd
29+
30+
logging.basicConfig(level=logging.INFO,
31+
format="%(asctime)s %(levelname)s | %(message)s")
32+
log = logging.getLogger("digimuh.table1")
33+
34+
35+
DB_PATH = Path("/media/geuba03p/GEURTEN01/cow.db")
36+
DATA_DIR = Path("results/broken_stick")
37+
OUT_PATH = DATA_DIR / "01_extract" / "table1_yearly_climate.md"
38+
NEUBAU_GROUPS = (1005, 1006)
39+
NEUBAU_BARNS = (1, 2)
40+
SUMMER_START = "06-01"
41+
SUMMER_END = "09-30"
42+
YEARS = (2021, 2022, 2023, 2024)
43+
44+
45+
# ── Cohort builder (re-uses the manuscript cohort) ───────────────
46+
47+
def neubau_pen_cohort(con: sqlite3.Connection,
48+
min_days: int = 30) -> pd.DataFrame:
49+
"""Per-(animal, year) cohort: ≥ ``min_days`` d in groups 1005/1006
50+
during 1 June – 30 September of each study year."""
51+
qmarks = ",".join("?" * len(NEUBAU_GROUPS))
52+
alloc = pd.read_sql(
53+
f"""SELECT animal_id, "group" AS grp, datetime_enter, datetime_exit
54+
FROM allocations
55+
WHERE "group" IN ({qmarks})
56+
AND datetime_enter <= '2024-09-30'
57+
AND datetime_exit >= '2021-04-01'""",
58+
con, params=list(NEUBAU_GROUPS),
59+
)
60+
alloc["datetime_enter"] = pd.to_datetime(alloc["datetime_enter"])
61+
alloc["datetime_exit"] = pd.to_datetime(alloc["datetime_exit"])
62+
rows: list[dict] = []
63+
for year in YEARS:
64+
s_start = pd.Timestamp(f"{year}-{SUMMER_START}")
65+
s_end = pd.Timestamp(f"{year}-{SUMMER_END}")
66+
for aid, grp in alloc.groupby("animal_id"):
67+
covers = []
68+
for _, r in grp.iterrows():
69+
lo = max(r["datetime_enter"], s_start)
70+
hi = min(r["datetime_exit"], s_end)
71+
if hi > lo:
72+
covers.append((lo, hi))
73+
if not covers:
74+
continue
75+
days = sum((hi - lo).days for lo, hi in covers)
76+
if days < min_days:
77+
continue
78+
enter = min(lo for lo, _ in covers)
79+
exit_ = max(hi for _, hi in covers)
80+
rows.append({
81+
"animal_id": int(aid), "year": int(year),
82+
"datetime_enter": enter, "datetime_exit": exit_,
83+
})
84+
return pd.DataFrame(rows)
85+
86+
87+
# ── Climate per summer (Neubau sensors) ──────────────────────────
88+
89+
def climate_summary(con: sqlite3.Connection) -> dict[int, dict]:
90+
"""Mean ± SD barn temp, RH, THI per summer. Reads at native
91+
sensor resolution from smaxtec_barns, filtered to Neubau."""
92+
qmarks = ",".join("?" * len(NEUBAU_BARNS))
93+
out: dict[int, dict] = {}
94+
all_T, all_H, all_TH = [], [], []
95+
for year in YEARS:
96+
s_start = f"{year}-{SUMMER_START}"
97+
s_end = f"{year}-{SUMMER_END} 23:59:59"
98+
df = pd.read_sql(
99+
f"""SELECT temp, hum, temp_hum_index
100+
FROM smaxtec_barns
101+
WHERE barn_id IN ({qmarks})
102+
AND "timestamp" >= ? AND "timestamp" <= ?
103+
AND temp IS NOT NULL
104+
AND hum IS NOT NULL
105+
AND temp_hum_index IS NOT NULL""",
106+
con, params=[*NEUBAU_BARNS, s_start, s_end],
107+
)
108+
out[year] = {
109+
"barn_T_mean": df["temp"].mean(),
110+
"barn_T_sd": df["temp"].std(),
111+
"barn_H_mean": df["hum"].mean(),
112+
"barn_H_sd": df["hum"].std(),
113+
"barn_THI_mean": df["temp_hum_index"].mean(),
114+
"barn_THI_sd": df["temp_hum_index"].std(),
115+
"n_climate": len(df),
116+
}
117+
all_T.append(df["temp"]); all_H.append(df["hum"]); all_TH.append(df["temp_hum_index"])
118+
overall = {
119+
"barn_T_mean": pd.concat(all_T).mean(),
120+
"barn_T_sd": pd.concat(all_T).std(),
121+
"barn_H_mean": pd.concat(all_H).mean(),
122+
"barn_H_sd": pd.concat(all_H).std(),
123+
"barn_THI_mean": pd.concat(all_TH).mean(),
124+
"barn_THI_sd": pd.concat(all_TH).std(),
125+
"n_climate": sum(len(s) for s in all_T),
126+
}
127+
out["overall"] = overall
128+
return out
129+
130+
131+
# ── RRT per summer (Neubau cohort × milking-hours-excluded) ──────
132+
133+
def rrt_summary(con: sqlite3.Connection,
134+
cohort: pd.DataFrame) -> dict[int, dict]:
135+
"""Mean ± SD reticulorumen temperature per summer, restricted to
136+
the Neubau cohort animals × each summer window. Filtered to
137+
30–43 °C; milking hours excluded (04–07, 16–19) to match the
138+
manuscript pipeline (§3.1 of analysis_00_methods.md). Reads in
139+
chunks to keep memory bounded."""
140+
out: dict[int, dict] = {}
141+
grand_n = 0
142+
grand_sum = 0.0
143+
grand_sumsq = 0.0
144+
for year in YEARS:
145+
ids = cohort.loc[cohort["year"] == year, "animal_id"].astype(int).unique()
146+
if len(ids) == 0:
147+
out[year] = {"rrt_mean": np.nan, "rrt_sd": np.nan, "n_rrt": 0}
148+
continue
149+
s_start = f"{year}-{SUMMER_START}"
150+
s_end = f"{year}-{SUMMER_END} 23:59:59"
151+
qmarks = ",".join("?" * len(ids))
152+
log.info(f" RRT query for {year} ({len(ids)} animals) ...")
153+
df = pd.read_sql(
154+
f"""SELECT "timestamp",
155+
CAST(temp_without_drink_cycles AS REAL) AS rrt
156+
FROM smaxtec_derived
157+
WHERE animal_id IN ({qmarks})
158+
AND "timestamp" >= ? AND "timestamp" <= ?
159+
AND temp_without_drink_cycles IS NOT NULL
160+
AND CAST(temp_without_drink_cycles AS REAL)
161+
BETWEEN 30 AND 43""",
162+
con, params=[*[int(x) for x in ids], s_start, s_end],
163+
)
164+
# Milking-hour exclusion (UTC hours 4-7 and 16-19, matching
165+
# extract_rumen_barn).
166+
ts = pd.to_datetime(df["timestamp"], utc=True)
167+
hr = ts.dt.hour
168+
df = df[~hr.between(4, 7) & ~hr.between(16, 19)]
169+
rrt = df["rrt"].dropna().to_numpy()
170+
out[year] = {
171+
"rrt_mean": float(np.mean(rrt)) if rrt.size else np.nan,
172+
"rrt_sd": float(np.std(rrt, ddof=1)) if rrt.size > 1 else np.nan,
173+
"n_rrt": int(rrt.size),
174+
}
175+
# Online accumulator for overall mean/sd (avoid concatenating
176+
# the year-arrays of millions of rows).
177+
grand_n += rrt.size
178+
grand_sum += float(np.sum(rrt))
179+
grand_sumsq += float(np.sum(rrt ** 2))
180+
if grand_n > 1:
181+
mean_o = grand_sum / grand_n
182+
var_o = (grand_sumsq - grand_n * mean_o ** 2) / (grand_n - 1)
183+
out["overall"] = {"rrt_mean": mean_o,
184+
"rrt_sd": float(np.sqrt(max(var_o, 0.0))),
185+
"n_rrt": grand_n}
186+
else:
187+
out["overall"] = {"rrt_mean": np.nan, "rrt_sd": np.nan, "n_rrt": 0}
188+
return out
189+
190+
191+
# ── Daily milk yield per summer (Neubau cohort) ──────────────────
192+
193+
def yield_summary(con: sqlite3.Connection,
194+
cohort: pd.DataFrame) -> dict[int, dict]:
195+
"""Mean ± SD of per-cow-day daily milk yield per summer. One
196+
record per (animal, date) — daily totals from herdeplus."""
197+
out: dict[int, dict] = {}
198+
all_y: list[pd.Series] = []
199+
for year in YEARS:
200+
ids = cohort.loc[cohort["year"] == year, "animal_id"].astype(int).unique()
201+
if len(ids) == 0:
202+
out[year] = {"y_mean": np.nan, "y_sd": np.nan, "n_y": 0}
203+
continue
204+
s_start = f"{year}-{SUMMER_START}"
205+
s_end = f"{year}-{SUMMER_END} 23:59:59"
206+
qmarks = ",".join("?" * len(ids))
207+
df = pd.read_sql(
208+
f"""SELECT animal_id,
209+
DATE("timestamp") AS date,
210+
SUM(herdeplus_milked_mkg) AS daily_yield_kg
211+
FROM herdeplus
212+
WHERE animal_id IN ({qmarks})
213+
AND "timestamp" >= ? AND "timestamp" <= ?
214+
AND herdeplus_milked_mkg IS NOT NULL
215+
AND herdeplus_milked_mkg > 0
216+
GROUP BY animal_id, DATE("timestamp")""",
217+
con, params=[*[int(x) for x in ids], s_start, s_end],
218+
)
219+
y = df["daily_yield_kg"].dropna()
220+
out[year] = {
221+
"y_mean": float(y.mean()) if not y.empty else np.nan,
222+
"y_sd": float(y.std(ddof=1)) if len(y) > 1 else np.nan,
223+
"n_y": int(len(y)),
224+
}
225+
all_y.append(y)
226+
big = pd.concat(all_y) if all_y else pd.Series(dtype=float)
227+
out["overall"] = {
228+
"y_mean": float(big.mean()) if not big.empty else np.nan,
229+
"y_sd": float(big.std(ddof=1)) if len(big) > 1 else np.nan,
230+
"n_y": int(len(big)),
231+
}
232+
return out
233+
234+
235+
# ── Markdown writer ──────────────────────────────────────────────
236+
237+
def fmt(mean, sd, decimals=1):
238+
if not np.isfinite(mean) or not np.isfinite(sd):
239+
return "—"
240+
return f"{mean:.{decimals}f} ± {sd:.{decimals}f}"
241+
242+
243+
def render_markdown(climate, rrt, yield_, cohort) -> str:
244+
n_cows = {y: int(cohort.loc[cohort.year == y, "animal_id"].nunique())
245+
for y in YEARS}
246+
n_cows["overall"] = int(cohort["animal_id"].nunique())
247+
248+
rows = []
249+
for key in [*YEARS, "overall"]:
250+
c = climate[key]
251+
r = rrt[key]
252+
ye = yield_[key]
253+
label = "Overall" if key == "overall" else str(key)
254+
rows.append({
255+
"Year": label,
256+
"Barn T (°C)": fmt(c["barn_T_mean"], c["barn_T_sd"]),
257+
"RH (%)": fmt(c["barn_H_mean"], c["barn_H_sd"]),
258+
"THI": fmt(c["barn_THI_mean"], c["barn_THI_sd"]),
259+
"n cows": str(n_cows[key]),
260+
"RRT (°C)": fmt(r["rrt_mean"], r["rrt_sd"], 2),
261+
"Daily yield (kg)": fmt(ye["y_mean"], ye["y_sd"]),
262+
})
263+
df = pd.DataFrame(rows)
264+
265+
md = []
266+
md.append("# Table 1 — Yearly climate, cohort size, RRT, and milk yield")
267+
md.append("")
268+
md.append("Summer period 1 June – 30 September each year. Cohort = Neubau pen cohort")
269+
md.append("(animals with ≥ 30 days in groups 1005 or 1006 during the summer window).")
270+
md.append("Climate from smaXtec barn sensors restricted to the Neubau barns")
271+
md.append("(`barn_id ∈ {1, 2}`). RRT from smaxtec_derived `temp_without_drink_cycles`")
272+
md.append("filtered to 30–43 °C with milking hours (04–07, 16–19 UTC) excluded.")
273+
md.append("Daily milk yield is the per-(animal, date) sum of `herdeplus_milked_mkg`.")
274+
md.append("Mean ± SD shown.")
275+
md.append("")
276+
md.append("| Year | Barn temperature (°C) | Relative air humidity (%) | "
277+
"Temperature-humidity index | Number of cows included | "
278+
"RRT (°C) | Daily milk yield (kg) |")
279+
md.append("|---|---|---|---|---:|---|---|")
280+
for _, r in df.iterrows():
281+
md.append(f"| {r['Year']} | {r['Barn T (°C)']} | {r['RH (%)']} | "
282+
f"{r['THI']} | {r['n cows']} | {r['RRT (°C)']} | "
283+
f"{r['Daily yield (kg)']} |")
284+
md.append("")
285+
clim_n = ", ".join(f"{y}: {climate[y]['n_climate']:,}" for y in YEARS)
286+
rrt_n = ", ".join(f"{y}: {rrt[y]['n_rrt']:,}" for y in YEARS)
287+
yld_n = ", ".join(f"{y}: {yield_[y]['n_y']:,}" for y in YEARS)
288+
md.append(
289+
f"*Underlying counts: climate readings n = {clim_n} "
290+
f"(overall {climate['overall']['n_climate']:,}); "
291+
f"RRT readings n = {rrt_n} "
292+
f"(overall {rrt['overall']['n_rrt']:,}); "
293+
f"daily-yield records n = {yld_n} "
294+
f"(overall {yield_['overall']['n_y']:,}).*")
295+
md.append("")
296+
return "\n".join(md)
297+
298+
299+
def main() -> None:
300+
if not DB_PATH.exists():
301+
log.error("Database not found at %s", DB_PATH)
302+
sys.exit(1)
303+
con = sqlite3.connect(DB_PATH)
304+
305+
log.info("Building Neubau cohort ...")
306+
cohort = neubau_pen_cohort(con)
307+
log.info(" cohort: %d cow-years across %d animals",
308+
len(cohort), cohort.animal_id.nunique())
309+
310+
log.info("Aggregating barn climate ...")
311+
clim = climate_summary(con)
312+
313+
log.info("Aggregating reticulorumen temperature ...")
314+
rrt = rrt_summary(con, cohort)
315+
316+
log.info("Aggregating daily milk yield ...")
317+
ye = yield_summary(con, cohort)
318+
319+
md = render_markdown(clim, rrt, ye, cohort)
320+
OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
321+
OUT_PATH.write_text(md)
322+
log.info("Wrote %s", OUT_PATH)
323+
print()
324+
print(md)
325+
326+
327+
if __name__ == "__main__":
328+
main()

0 commit comments

Comments
 (0)