|
| 1 | +import pickle |
| 2 | +import warnings |
| 3 | +from datetime import datetime, timezone |
| 4 | +from io import StringIO |
| 5 | +from typing import Any |
| 6 | + |
| 7 | +# noinspection PyUnresolvedReferences |
| 8 | +from duckdb import DuckDBPyConnection |
| 9 | +from scipy.constants import N_A |
| 10 | +import polars as pl |
| 11 | + |
| 12 | +from ecoli.library.parquet_emitter import ( |
| 13 | + read_stacked_columns, |
| 14 | + field_metadata, |
| 15 | + skip_n_gens, |
| 16 | +) |
| 17 | +from reconstruction.ecoli.fit_sim_data_1 import SimulationDataEcoli |
| 18 | + |
| 19 | + |
| 20 | +def plot( |
| 21 | + params: dict[str, Any], |
| 22 | + conn: DuckDBPyConnection, |
| 23 | + history_sql: str, |
| 24 | + config_sql: str, |
| 25 | + success_sql: str, |
| 26 | + sim_data_dict: dict[str, dict[int, str]], |
| 27 | + validation_data_paths: list[str], |
| 28 | + outdir: str, |
| 29 | + variant_metadata: dict[str, dict[int, Any]], |
| 30 | + variant_names: dict[str, str], |
| 31 | +): |
| 32 | + """ |
| 33 | + Analysis script for calculating the mean, standard deviation, and minimum values |
| 34 | + of tRNA synthetase concentrations for a set of baseline simulations (no growth |
| 35 | + rate control or kinetic tRNA charging) in minimal and rich media. The minimal |
| 36 | + workflow required to generate this data is ``configs/no_grc.json``. The output |
| 37 | + of this analysis should be used to replace |
| 38 | + ``reconstruction/ecoli/flat/optimization/trna_synthetase_dynamic_range.tsv`` |
| 39 | + every time changes are made that could affect tRNA synthetase concentrations. |
| 40 | +
|
| 41 | + Args: |
| 42 | + params: Dictionary of parameters that can include the following keys:: |
| 43 | +
|
| 44 | + { |
| 45 | + "skip_gens": int # Number of initial generations to skip |
| 46 | + } |
| 47 | +
|
| 48 | + """ |
| 49 | + # Skip n generations if provided |
| 50 | + skip_gens = params.get("skip_gens") |
| 51 | + if skip_gens is not None: |
| 52 | + history_sql = skip_n_gens(history_sql, skip_gens) |
| 53 | + # Figure out which variant(s) are minimal and which are rich media |
| 54 | + experiment_ids = list(variant_names.keys()) |
| 55 | + assert len(experiment_ids) == 1, ( |
| 56 | + "This analysis script is only intended to be run with one experiment ID " |
| 57 | + "at a time." |
| 58 | + ) |
| 59 | + experiment_id = experiment_ids[0] |
| 60 | + variant_name = variant_names[experiment_id] |
| 61 | + assert variant_name == "condition", ( |
| 62 | + "This analysis script is only intended to be run with the 'condition' variant." |
| 63 | + ) |
| 64 | + variant_id_data: dict[str, Any] = { |
| 65 | + "basal": { |
| 66 | + "variant_ids": [], |
| 67 | + "bulk_ids": [], |
| 68 | + "synthetase_ids": [], |
| 69 | + "synthetase_idx": [], |
| 70 | + "summary_stats": [], |
| 71 | + }, |
| 72 | + "with_aa": { |
| 73 | + "variant_ids": [], |
| 74 | + "bulk_ids": [], |
| 75 | + "synthetase_ids": [], |
| 76 | + "synthetase_idx": [], |
| 77 | + "summary_stats": [], |
| 78 | + }, |
| 79 | + } |
| 80 | + for variant_id, variant_params in variant_metadata[experiment_id].items(): |
| 81 | + condition_label: str = "" |
| 82 | + if isinstance(variant_params, str): |
| 83 | + if variant_params != "baseline": |
| 84 | + warnings.warn( |
| 85 | + f"Ignoring variant {variant_id} with unexpected label: {variant_params}" |
| 86 | + ) |
| 87 | + continue |
| 88 | + condition_label = "basal" |
| 89 | + elif isinstance(variant_params, dict): |
| 90 | + condition_label = variant_params.get("condition", "") |
| 91 | + else: |
| 92 | + warnings.warn( |
| 93 | + f"Ignoring variant {variant_id} with unsupported metadata type: {type(variant_params)}" |
| 94 | + ) |
| 95 | + continue |
| 96 | + |
| 97 | + media_info = variant_id_data[condition_label] |
| 98 | + if variant_id not in media_info["variant_ids"]: |
| 99 | + media_info["variant_ids"].append(variant_id) |
| 100 | + |
| 101 | + for condition_label, media_info in variant_id_data.items(): |
| 102 | + if len(media_info["variant_ids"]) == 0: |
| 103 | + raise ValueError(f"No variants found for media type: {condition_label}") |
| 104 | + ordered_bulk_ids: list[str] | None = None |
| 105 | + trna_synthetase_ids: list[str] | None = None |
| 106 | + for variant_id in media_info["variant_ids"]: |
| 107 | + var_subquery = f"FROM ({config_sql}) WHERE variant = {variant_id}" |
| 108 | + bulk_ids = field_metadata(conn, var_subquery, "bulk") |
| 109 | + if ordered_bulk_ids is None: |
| 110 | + ordered_bulk_ids = bulk_ids |
| 111 | + elif bulk_ids != ordered_bulk_ids: |
| 112 | + raise ValueError( |
| 113 | + f"Bulk ID order mismatch for media type '{condition_label}'. Ensure all " |
| 114 | + "variants list bulk IDs in the same order." |
| 115 | + ) |
| 116 | + |
| 117 | + sim_data_path = sim_data_dict[experiment_id].get(variant_id) |
| 118 | + if sim_data_path is None: |
| 119 | + raise ValueError( |
| 120 | + f"Missing sim_data path for experiment {experiment_id}, variant {variant_id}." |
| 121 | + ) |
| 122 | + with open(sim_data_path, "rb") as f: |
| 123 | + sim_data: SimulationDataEcoli = pickle.load(f) |
| 124 | + variant_trna_synthetase_ids = list( |
| 125 | + sim_data.process.transcription.synthetase_names |
| 126 | + ) |
| 127 | + if trna_synthetase_ids is None: |
| 128 | + trna_synthetase_ids = variant_trna_synthetase_ids |
| 129 | + elif variant_trna_synthetase_ids != trna_synthetase_ids: |
| 130 | + raise ValueError( |
| 131 | + "tRNA synthetase ID mismatch between variants. Ensure all variants " |
| 132 | + "have the same set of tRNA synthetases." |
| 133 | + ) |
| 134 | + |
| 135 | + if ordered_bulk_ids is None or trna_synthetase_ids is None: |
| 136 | + raise ValueError( |
| 137 | + f"Unable to determine metadata for media type '{condition_label}'." |
| 138 | + ) |
| 139 | + media_info["synthetase_ids"] = trna_synthetase_ids |
| 140 | + media_info["bulk_ids"] = ordered_bulk_ids |
| 141 | + |
| 142 | + # Get indices of tRNA synthetases in bulk IDs |
| 143 | + # and read concentration summary stats |
| 144 | + for condition_label, media_info in variant_id_data.items(): |
| 145 | + synthetase_ids = media_info.get("synthetase_ids") |
| 146 | + bulk_ids = media_info.get("bulk_ids") |
| 147 | + if synthetase_ids is None or bulk_ids is None: |
| 148 | + raise ValueError( |
| 149 | + f"Missing bulk or synthetase metadata for media type '{condition_label}'." |
| 150 | + ) |
| 151 | + media_info["synthetase_idx"] = [] |
| 152 | + for synthetase_id in synthetase_ids: |
| 153 | + try: |
| 154 | + synthetase_idx = bulk_ids.index(synthetase_id) |
| 155 | + except ValueError: |
| 156 | + raise ValueError( |
| 157 | + f"tRNA synthetase ID '{synthetase_id}' not found in bulk IDs for " |
| 158 | + f"media type '{condition_label}'." |
| 159 | + ) |
| 160 | + media_info["synthetase_idx"].append(synthetase_idx + 1) # 1-based indexing |
| 161 | + subquery = read_stacked_columns( |
| 162 | + history_sql, |
| 163 | + ["bulk", "listeners__mass__volume AS volume"], |
| 164 | + order_results=False, |
| 165 | + ) |
| 166 | + # Scale concs from counts / fL to umol / L |
| 167 | + # 1e15 fL / L * 1 mol / N_A counts * 1e6 umol / mol |
| 168 | + scale_factor = 1e15 / N_A * 1e6 |
| 169 | + # Read each variant ID individually as DuckDB does not support filter |
| 170 | + # pushdown for IN statements (e.g. WHERE variant IN (...)) |
| 171 | + for variant_id in media_info["variant_ids"]: |
| 172 | + variant_subquery = f"FROM ({subquery}) WHERE variant = {variant_id}" |
| 173 | + trna_synthetase_stats = conn.sql(f""" |
| 174 | + WITH synthetase_counts AS ( |
| 175 | + SELECT list_select(bulk, {media_info["synthetase_idx"]}) AS counts, |
| 176 | + volume, variant |
| 177 | + FROM ({variant_subquery}) |
| 178 | + ), |
| 179 | + unnested_counts AS ( |
| 180 | + SELECT unnest(counts) AS counts, volume, |
| 181 | + generate_subscripts(counts, 1) AS count_idx, variant |
| 182 | + FROM synthetase_counts |
| 183 | + ), |
| 184 | + concs AS ( |
| 185 | + SELECT counts / volume * {scale_factor} AS bulk_concs, count_idx, variant |
| 186 | + FROM unnested_counts |
| 187 | + ) |
| 188 | + SELECT avg(bulk_concs) AS avg_concs, stddev(bulk_concs) AS std_concs, |
| 189 | + min(bulk_concs) AS min_concs, count_idx, variant |
| 190 | + FROM concs |
| 191 | + GROUP BY variant, count_idx |
| 192 | + ORDER BY variant, count_idx |
| 193 | + """).pl() |
| 194 | + stats_column_names = { |
| 195 | + "avg_concs": "mean (units.umol / units.L)", |
| 196 | + "std_concs": "std (units.umol / units.L)", |
| 197 | + "min_concs": "min (units.umol / units.L)", |
| 198 | + } |
| 199 | + synthetase_condition = [ |
| 200 | + f"{synth_id}__{condition_label}" |
| 201 | + for synth_id in media_info["synthetase_ids"] |
| 202 | + ] |
| 203 | + trna_synthetase_stats = ( |
| 204 | + trna_synthetase_stats.rename(stats_column_names) |
| 205 | + .with_columns( |
| 206 | + pl.col("count_idx") |
| 207 | + .map_elements( |
| 208 | + lambda idx: synthetase_condition[idx - 1], |
| 209 | + return_dtype=pl.Utf8, |
| 210 | + ) |
| 211 | + .alias("synthetase_condition") |
| 212 | + ) |
| 213 | + .select(["synthetase_condition"] + list(stats_column_names.values())) |
| 214 | + ) |
| 215 | + media_info["summary_stats"].append(trna_synthetase_stats) |
| 216 | + |
| 217 | + # Combine and save data |
| 218 | + final_df = pl.concat( |
| 219 | + [ |
| 220 | + pl.concat(condition_data["summary_stats"]) |
| 221 | + for condition_data in variant_id_data.values() |
| 222 | + ] |
| 223 | + ).sort("synthetase_condition") |
| 224 | + |
| 225 | + # Build metadata comments for reproducibility |
| 226 | + timestamp_utc = datetime.now(timezone.utc).isoformat() |
| 227 | + metadata_lines = [ |
| 228 | + "# Generated by ecoli.analysis.multivariant.trna_synthetase_concs", |
| 229 | + f"# Run timestamp (UTC): {timestamp_utc}", |
| 230 | + f"# Experiment ID: {experiment_id}", |
| 231 | + f"# Skip n gens: {skip_gens if skip_gens is not None else '0'}", |
| 232 | + ] |
| 233 | + |
| 234 | + output_path = f"{outdir}/trna_synthetase_dynamic_range.tsv" |
| 235 | + buffer = StringIO() |
| 236 | + final_df.write_csv(buffer, separator="\t", quote_style="non_numeric") |
| 237 | + with open(output_path, "w", encoding="utf-8") as f: |
| 238 | + f.write("\n".join(metadata_lines)) |
| 239 | + f.write("\n") |
| 240 | + f.write(buffer.getvalue()) |
| 241 | + print(f"Saved tRNA synthetase dynamic range data to: {output_path}") |
0 commit comments