Skip to content

Commit 5aab046

Browse files
committed
Added evaluation data generation ribonn
1 parent 181b970 commit 5aab046

1 file changed

Lines changed: 93 additions & 0 deletions

File tree

  • bionemo-recipes/recipes/codonfm_native_te/evaluation/ribonn
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: LicenseRef-Apache2
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""Download and preprocess the RiboNN translation efficiency dataset.
17+
18+
Extracted verbatim from notebooks/4-EnCodon-Downstream-Task-riboNN.ipynb (section 3).
19+
"""
20+
21+
import os
22+
import urllib.request
23+
from pathlib import Path
24+
25+
import polars as pl
26+
27+
28+
# Configurable dataset path
data_path = "/data/validation/processed/data_with_human_TE_cellline_all_NA_plain.csv"

# Source URL for the TE dataset
te_dataset_url = "https://raw.githubusercontent.com/CenikLab/TE_classic_ML/refs/heads/main/data/data_with_human_TE_cellline_all_NA_plain.csv"

# Ensure parent directory exists
Path(data_path).parent.mkdir(parents=True, exist_ok=True)

# Download if missing.
# The file is fetched into a temporary ".part" sibling and only moved onto
# data_path once the transfer has finished. Because the existence check below
# is the only validation, writing directly to data_path would let an
# interrupted download be mistaken for a complete dataset on the next run.
if not os.path.exists(data_path):
    print(f"Downloading TE dataset to {data_path} ...")
    partial_path = data_path + ".part"
    try:
        urllib.request.urlretrieve(te_dataset_url, partial_path)
        # Same-directory rename, so the final file appears in one step.
        os.replace(partial_path, data_path)
    finally:
        # After a successful replace this is a no-op; after a failure it
        # removes the incomplete temporary file.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print("Download complete.")
else:
    print(f"Found existing dataset at {data_path}.")
44+
45+
46+
# Slice the transcript sequence into CDS / 5'UTR / 3'UTR using utr5_size and cds_size,
# and add a row index column 'id'.
#
# Layout used for slicing (offsets are positions within tx_sequence):
#   [0, utr5_size)                      -> utr5_sequence
#   [utr5_size, utr5_size + cds_size)   -> cds_sequence
#   [utr5_size + cds_size, end)         -> utr3_sequence
#
# The slices are expressed with native polars string expressions rather than
# per-row Python callbacks, so no Python function is invoked for each row.
# NOTE(review): rows with a null size or sequence now yield a null slice instead
# of raising inside a Python lambda — confirm this is acceptable for the dataset.
data = pl.read_csv(data_path, separator="\t")

tx_sequence = pl.col("tx_sequence")
cds_start = pl.col("utr5_size")
cds_end = pl.col("utr5_size") + pl.col("cds_size")

data = data.with_columns(
    [
        tx_sequence.str.slice(cds_start, pl.col("cds_size")).alias("cds_sequence"),
        tx_sequence.str.slice(0, cds_start).alias("utr5_sequence"),
        tx_sequence.str.slice(cds_end).alias("utr3_sequence"),
    ]
).with_row_index("id")

# Derive the output name from the real extension instead of assuming it is
# exactly four characters long; for a ".csv" input the result is unchanged.
output_path = os.path.splitext(data_path)[0] + ".processed.csv"
data.write_csv(output_path)
66+
67+
68+
# Load processed RiboNN dataset and report basic statistics on the mean_te target.
#
# data_loaded ends up True only when the processed CSV was re-read from
# output_path. If the re-read fails, the failure is printed and the statistics
# below are computed on whatever `data` already holds.
data_loaded = False
if os.path.exists(output_path):
    try:
        data = pl.read_csv(output_path)
        # Set the flag as soon as the read succeeds so that it reflects the
        # load itself, not the reporting that follows.
        data_loaded = True
        print(f"✅ Loaded {len(data)} sequences from: {output_path}")
        print(f"Shape: {data.shape}")
        print(f"Key columns: {[col for col in ['id', 'cds_sequence', 'mean_te', 'fold'] if col in data.columns]}")
    except Exception as e:
        # Deliberately best-effort: report the problem and carry on.
        print(f"Failed to load {output_path}: {e}")

# Show basic statistics
# te_stats is a one-row frame with columns mean / std / min / max; only mean
# and the min-max range are printed, std stays available in te_stats.
te_stats = data.select(
    [
        pl.col("mean_te").mean().alias("mean"),
        pl.col("mean_te").std().alias("std"),
        pl.col("mean_te").min().alias("min"),
        pl.col("mean_te").max().alias("max"),
    ]
)
print("\nTranslation Efficiency stats:")
print(f" Mean: {te_stats['mean'][0]:.4f}")
print(f" Range: [{te_stats['min'][0]:.4f}, {te_stats['max'][0]:.4f}]")

0 commit comments

Comments
 (0)