Skip to content

Commit f5f12bc

Browse files
committed
Workflow and analysis to regenerate tRNA synthetase dynamic range data
1 parent 07883b3 commit f5f12bc

2 files changed

Lines changed: 275 additions & 0 deletions

File tree

configs/no_grc.json

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
{
2+
"experiment_id": "no_grc",
3+
"suffix_time": false,
4+
"parca_options": {
5+
"cpus": 16
6+
},
7+
"fail_at_max_duration": true,
8+
"sim_data_path": null,
9+
"generations": 32,
10+
"n_init_sims": 16,
11+
"single_daughters": true,
12+
"emitter": "parquet",
13+
"emitter_arg": {
14+
"out_dir": "out"
15+
},
16+
"variants": {
17+
"condition": {
18+
"condition": {"value": ["with_aa"]}
19+
}
20+
},
21+
"analysis_options": {
22+
"single": {"mass_fraction_summary": {}},
23+
"multivariant": {"trna_synthetase_concs": {"skip_gens": 4}}
24+
},
25+
"steady_state_trna_charging": false,
26+
"kinetic_trna_charging": false,
27+
"coarse_kinetic_elongation": false,
28+
"ppgpp_regulation": false,
29+
"mechanistic_replisome": false,
30+
"mechanistic_translation_supply": false,
31+
"mechanistic_aa_transport": false,
32+
"translation_supply": false,
33+
"aa_supply_in_charging": false
34+
}
Lines changed: 241 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,241 @@
1+
import pickle
2+
import warnings
3+
from datetime import datetime, timezone
4+
from io import StringIO
5+
from typing import Any
6+
7+
# noinspection PyUnresolvedReferences
8+
from duckdb import DuckDBPyConnection
9+
from scipy.constants import N_A
10+
import polars as pl
11+
12+
from ecoli.library.parquet_emitter import (
13+
read_stacked_columns,
14+
field_metadata,
15+
skip_n_gens,
16+
)
17+
from reconstruction.ecoli.fit_sim_data_1 import SimulationDataEcoli
18+
19+
20+
def plot(
21+
params: dict[str, Any],
22+
conn: DuckDBPyConnection,
23+
history_sql: str,
24+
config_sql: str,
25+
success_sql: str,
26+
sim_data_dict: dict[str, dict[int, str]],
27+
validation_data_paths: list[str],
28+
outdir: str,
29+
variant_metadata: dict[str, dict[int, Any]],
30+
variant_names: dict[str, str],
31+
):
32+
"""
33+
Analysis script for calculating the mean, standard deviation, and minimum values
34+
of tRNA synthetase concentrations for a set of baseline simulations (no growth
35+
rate control or kinetic tRNA charging) in minimal and rich media. The minimal
36+
workflow required to generate this data is ``configs/no_grc.json``. The output
37+
of this analysis should be used to replace
38+
``reconstruction/ecoli/flat/optimization/trna_synthetase_dynamic_range.tsv``
39+
every time changes are made that could affect tRNA synthetase concentrations.
40+
41+
Args:
42+
params: Dictionary of parameters that can include the following keys::
43+
44+
{
45+
"skip_gens": int # Number of initial generations to skip
46+
}
47+
48+
"""
49+
# Skip n generations if provided
50+
skip_gens = params.get("skip_gens")
51+
if skip_gens is not None:
52+
history_sql = skip_n_gens(history_sql, skip_gens)
53+
# Figure out which variant(s) are minimal and which are rich media
54+
experiment_ids = list(variant_names.keys())
55+
assert len(experiment_ids) == 1, (
56+
"This analysis script is only intended to be run with one experiment ID "
57+
"at a time."
58+
)
59+
experiment_id = experiment_ids[0]
60+
variant_name = variant_names[experiment_id]
61+
assert variant_name == "condition", (
62+
"This analysis script is only intended to be run with the 'condition' variant."
63+
)
64+
variant_id_data: dict[str, Any] = {
65+
"basal": {
66+
"variant_ids": [],
67+
"bulk_ids": [],
68+
"synthetase_ids": [],
69+
"synthetase_idx": [],
70+
"summary_stats": [],
71+
},
72+
"with_aa": {
73+
"variant_ids": [],
74+
"bulk_ids": [],
75+
"synthetase_ids": [],
76+
"synthetase_idx": [],
77+
"summary_stats": [],
78+
},
79+
}
80+
for variant_id, variant_params in variant_metadata[experiment_id].items():
81+
condition_label: str = ""
82+
if isinstance(variant_params, str):
83+
if variant_params != "baseline":
84+
warnings.warn(
85+
f"Ignoring variant {variant_id} with unexpected label: {variant_params}"
86+
)
87+
continue
88+
condition_label = "basal"
89+
elif isinstance(variant_params, dict):
90+
condition_label = variant_params.get("condition", "")
91+
else:
92+
warnings.warn(
93+
f"Ignoring variant {variant_id} with unsupported metadata type: {type(variant_params)}"
94+
)
95+
continue
96+
97+
media_info = variant_id_data[condition_label]
98+
if variant_id not in media_info["variant_ids"]:
99+
media_info["variant_ids"].append(variant_id)
100+
101+
for condition_label, media_info in variant_id_data.items():
102+
if len(media_info["variant_ids"]) == 0:
103+
raise ValueError(f"No variants found for media type: {condition_label}")
104+
ordered_bulk_ids: list[str] | None = None
105+
trna_synthetase_ids: list[str] | None = None
106+
for variant_id in media_info["variant_ids"]:
107+
var_subquery = f"FROM ({config_sql}) WHERE variant = {variant_id}"
108+
bulk_ids = field_metadata(conn, var_subquery, "bulk")
109+
if ordered_bulk_ids is None:
110+
ordered_bulk_ids = bulk_ids
111+
elif bulk_ids != ordered_bulk_ids:
112+
raise ValueError(
113+
f"Bulk ID order mismatch for media type '{condition_label}'. Ensure all "
114+
"variants list bulk IDs in the same order."
115+
)
116+
117+
sim_data_path = sim_data_dict[experiment_id].get(variant_id)
118+
if sim_data_path is None:
119+
raise ValueError(
120+
f"Missing sim_data path for experiment {experiment_id}, variant {variant_id}."
121+
)
122+
with open(sim_data_path, "rb") as f:
123+
sim_data: SimulationDataEcoli = pickle.load(f)
124+
variant_trna_synthetase_ids = list(
125+
sim_data.process.transcription.synthetase_names
126+
)
127+
if trna_synthetase_ids is None:
128+
trna_synthetase_ids = variant_trna_synthetase_ids
129+
elif variant_trna_synthetase_ids != trna_synthetase_ids:
130+
raise ValueError(
131+
"tRNA synthetase ID mismatch between variants. Ensure all variants "
132+
"have the same set of tRNA synthetases."
133+
)
134+
135+
if ordered_bulk_ids is None or trna_synthetase_ids is None:
136+
raise ValueError(
137+
f"Unable to determine metadata for media type '{condition_label}'."
138+
)
139+
media_info["synthetase_ids"] = trna_synthetase_ids
140+
media_info["bulk_ids"] = ordered_bulk_ids
141+
142+
# Get indices of tRNA synthetases in bulk IDs
143+
# and read concentration summary stats
144+
for condition_label, media_info in variant_id_data.items():
145+
synthetase_ids = media_info.get("synthetase_ids")
146+
bulk_ids = media_info.get("bulk_ids")
147+
if synthetase_ids is None or bulk_ids is None:
148+
raise ValueError(
149+
f"Missing bulk or synthetase metadata for media type '{condition_label}'."
150+
)
151+
media_info["synthetase_idx"] = []
152+
for synthetase_id in synthetase_ids:
153+
try:
154+
synthetase_idx = bulk_ids.index(synthetase_id)
155+
except ValueError:
156+
raise ValueError(
157+
f"tRNA synthetase ID '{synthetase_id}' not found in bulk IDs for "
158+
f"media type '{condition_label}'."
159+
)
160+
media_info["synthetase_idx"].append(synthetase_idx + 1) # 1-based indexing
161+
subquery = read_stacked_columns(
162+
history_sql,
163+
["bulk", "listeners__mass__volume AS volume"],
164+
order_results=False,
165+
)
166+
# Scale concs from counts / fL to umol / L
167+
# 1e15 fL / L * 1 mol / N_A counts * 1e6 umol / mol
168+
scale_factor = 1e15 / N_A * 1e6
169+
# Read each variant ID individually as DuckDB does not support filter
170+
# pushdown for IN statements (e.g. WHERE variant IN (...))
171+
for variant_id in media_info["variant_ids"]:
172+
variant_subquery = f"FROM ({subquery}) WHERE variant = {variant_id}"
173+
trna_synthetase_stats = conn.sql(f"""
174+
WITH synthetase_counts AS (
175+
SELECT list_select(bulk, {media_info["synthetase_idx"]}) AS counts,
176+
volume, variant
177+
FROM ({variant_subquery})
178+
),
179+
unnested_counts AS (
180+
SELECT unnest(counts) AS counts, volume,
181+
generate_subscripts(counts, 1) AS count_idx, variant
182+
FROM synthetase_counts
183+
),
184+
concs AS (
185+
SELECT counts / volume * {scale_factor} AS bulk_concs, count_idx, variant
186+
FROM unnested_counts
187+
)
188+
SELECT avg(bulk_concs) AS avg_concs, stddev(bulk_concs) AS std_concs,
189+
min(bulk_concs) AS min_concs, count_idx, variant
190+
FROM concs
191+
GROUP BY variant, count_idx
192+
ORDER BY variant, count_idx
193+
""").pl()
194+
stats_column_names = {
195+
"avg_concs": "mean (units.umol / units.L)",
196+
"std_concs": "std (units.umol / units.L)",
197+
"min_concs": "min (units.umol / units.L)",
198+
}
199+
synthetase_condition = [
200+
f"{synth_id}__{condition_label}"
201+
for synth_id in media_info["synthetase_ids"]
202+
]
203+
trna_synthetase_stats = (
204+
trna_synthetase_stats.rename(stats_column_names)
205+
.with_columns(
206+
pl.col("count_idx")
207+
.map_elements(
208+
lambda idx: synthetase_condition[idx - 1],
209+
return_dtype=pl.Utf8,
210+
)
211+
.alias("synthetase_condition")
212+
)
213+
.select(["synthetase_condition"] + list(stats_column_names.values()))
214+
)
215+
media_info["summary_stats"].append(trna_synthetase_stats)
216+
217+
# Combine and save data
218+
final_df = pl.concat(
219+
[
220+
pl.concat(condition_data["summary_stats"])
221+
for condition_data in variant_id_data.values()
222+
]
223+
).sort("synthetase_condition")
224+
225+
# Build metadata comments for reproducibility
226+
timestamp_utc = datetime.now(timezone.utc).isoformat()
227+
metadata_lines = [
228+
"# Generated by ecoli.analysis.multivariant.trna_synthetase_concs",
229+
f"# Run timestamp (UTC): {timestamp_utc}",
230+
f"# Experiment ID: {experiment_id}",
231+
f"# Skip n gens: {skip_gens if skip_gens is not None else '0'}",
232+
]
233+
234+
output_path = f"{outdir}/trna_synthetase_dynamic_range.tsv"
235+
buffer = StringIO()
236+
final_df.write_csv(buffer, separator="\t", quote_style="non_numeric")
237+
with open(output_path, "w", encoding="utf-8") as f:
238+
f.write("\n".join(metadata_lines))
239+
f.write("\n")
240+
f.write(buffer.getvalue())
241+
print(f"Saved tRNA synthetase dynamic range data to: {output_path}")

0 commit comments

Comments
 (0)