Distillative-AI
diff --git a/‎bionemo-recipes/interpretability/sparse_autoencoders/recipes/evo2/pyproject.toml‎
Lines changed: 25 additions & 0 deletions b/‎bionemo-recipes/interpretability/sparse_autoencoders/recipes/evo2/pyproject.toml‎
Lines changed: 25 additions & 0 deletions
diff --git a/‎bionemo-recipes/interpretability/sparse_autoencoders/recipes/evo2/scripts/7b.sh‎
Lines changed: 95 additions & 0 deletions b/‎bionemo-recipes/interpretability/sparse_autoencoders/recipes/evo2/scripts/7b.sh‎
Lines changed: 95 additions & 0 deletions
diff --git a/‎bionemo-recipes/interpretability/sparse_autoencoders/recipes/evo2/scripts/chunk_fasta.py‎
Lines changed: 77 additions & 0 deletions b/‎bionemo-recipes/interpretability/sparse_autoencoders/recipes/evo2/scripts/chunk_fasta.py‎
Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,25 @@
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "evo2-sae"
+version = "0.1.0"
+description = "Sparse Autoencoders for the Evo2 DNA language model"
+requires-python = ">=3.10"
+
+dependencies = [
+    "sae",
+    "torch>=2.0",
+    "numpy>=1.20",
+    "pyarrow>=23.0.0",
+]
+
+# No package code lives here yet — the recipe is just an entry-point for
+# scripts/ that depends on the shared `sae` workspace package. Declare no
+# packages so setuptools doesn't try to discover anything.
+[tool.setuptools]
+packages = []
+
+[tool.uv.sources]
+sae = { workspace = true }
@@ -0,0 +1,95 @@
+#!/bin/bash
+# Evo2 7B layer-26 SAE recipe: chunk FASTA -> stream-extract activations -> train SAE.
+# This reproduces the layer26_7B (normalize_input) run.
+#
+# Prerequisites (this recipe does NOT download or convert the model):
+#   - An Evo2 7B *MBridge* checkpoint directory (CKPT_DIR). Obtain it from NGC, e.g.:
+#         ngc registry model download-version "nvidia/clara/evo2:7b_<ver>" --dest "${WORK_ROOT}/checkpoints"
+#     (or convert a nemo2 checkpoint to MBridge with the evo2_megatron converter).
+#   - bionemo-recipes/recipes/evo2_megatron built (.ci_build.sh) with its .venv active,
+#     providing `predict_evo2`.
+#   - The `sae` workspace package importable in that same venv.
+#
+# Override any of these by exporting before invocation.
+
+# Abort on any command failure, unset variable, or failed pipeline stage.
+set -euo pipefail
+
+# Location of the evo2_megatron recipe whose .venv is activated below.
+EVO2_MEGATRON_DIR="${EVO2_MEGATRON_DIR:-/workspace/bionemo-framework/bionemo-recipes/recipes/evo2_megatron}"
+# Absolute path of the parent of the directory that holds this script.
+RECIPE_DIR="$(cd "$(dirname "$0")/.." && pwd)"
+
+# Model layer passed to extract.py (--embedding-layer) and train.py (--layer).
+LAYER="${LAYER:-26}"
+# Context length the activations were extracted at (the model is context-extended; we
+# trained the SAE on 8192-bp chunks).
+CHUNK_BP="${CHUNK_BP:-8192}"
+
+# An Evo2 7B MBridge checkpoint directory (see prerequisites above).
+# `:?` makes the script exit with the quoted message when the variable is unset or empty.
+CKPT_DIR="${CKPT_DIR:?Set CKPT_DIR to an Evo2 7B MBridge checkpoint directory (see header)}"
+FASTA="${FASTA:?Set FASTA to the (prok+euk) input sequences}"
+# Root under which the chunked FASTA, activation shards, and SAE outputs are written.
+WORK_ROOT="${WORK_ROOT:-/data/interp/evo2}"
+
+NPROC="${NPROC:-8}"            # GPUs / DP ranks
+# Forwarded to extract.py as --max-tokens in STEP 1.
+MAX_TOKENS="${MAX_TOKENS:-1000000000}"
+
+# Derived locations: STEP 1 writes PARQUET_DIR; STEP 2 reads it and writes under OUTPUT_DIR.
+PARQUET_DIR="${WORK_ROOT}/activations/evo2_7b_layer${LAYER}_parquet"
+OUTPUT_DIR="${WORK_ROOT}/sae/evo2_7b_layer${LAYER}"
+
+# Activate the evo2_megatron virtualenv so the python/torchrun calls below resolve inside it.
+source "${EVO2_MEGATRON_DIR}/.venv/bin/activate"
+
+echo "============================================================"
+echo "STEP 0: Chunk FASTA to <=${CHUNK_BP} bp"
+echo "============================================================"
+# Name the chunked file after the input, minus a trailing .gz and then a trailing .fasta.
+INPUT_STEM="$(basename "$FASTA")"; INPUT_STEM="${INPUT_STEM%.gz}"; INPUT_STEM="${INPUT_STEM%.fasta}"
+CHUNKED_FASTA="${WORK_ROOT}/scratch/${INPUT_STEM}_chunked${CHUNK_BP}.fasta"
+if [[ -f "$CHUNKED_FASTA" ]]; then
+    echo "Reusing existing chunked FASTA: $CHUNKED_FASTA"
+else
+    # Chunk into a sibling ".partial" file and rename only after chunk_fasta.py
+    # exits successfully. Because the check above reuses any existing file, an
+    # interrupted run must never leave a truncated file at $CHUNKED_FASTA.
+    CHUNKED_FASTA_PARTIAL="${CHUNKED_FASTA}.partial"
+    rm -f "$CHUNKED_FASTA_PARTIAL"
+    python "${RECIPE_DIR}/scripts/chunk_fasta.py" --input "$FASTA" --output "$CHUNKED_FASTA_PARTIAL" --window "$CHUNK_BP"
+    mv -f "$CHUNKED_FASTA_PARTIAL" "$CHUNKED_FASTA"
+fi
+
+echo "============================================================"
+echo "STEP 1: Stream-extract layer-${LAYER} activations -> parquet ActivationStore (no .pt)"
+echo "============================================================"
+# metadata.json is used as the "extraction already done" marker.
+# NOTE(review): this assumes extract.py writes metadata.json only after all shards
+# are complete; extract.py is not visible here, so confirm before relying on it.
+if [[ -f "${PARQUET_DIR}/metadata.json" ]]; then
+    echo "Reusing existing parquet shards at $PARQUET_DIR"
+else
+    # One process per GPU; micro-batch size and dtype are fixed for this recipe.
+    torchrun --nproc_per_node="$NPROC" "${RECIPE_DIR}/scripts/extract.py" \
+        --ckpt-dir "$CKPT_DIR" \
+        --embedding-layer "$LAYER" \
+        --fasta "$CHUNKED_FASTA" \
+        --activation-store-dir "$PARQUET_DIR" \
+        --max-tokens "$MAX_TOKENS" \
+        --micro-batch-size 4 \
+        --dtype fp32
+fi
+
+echo "============================================================"
+echo "STEP 2: Train TopK SAE (layer26_7B normalize_input config)"
+echo "============================================================"
+# Drop any WANDB_API_KEY inherited from the environment so that the credentials in
+# ~/.netrc are used instead; `|| true` keeps `set -e` from aborting if unset fails.
+# The wandb entity defaults to clara-discovery and can be overridden via WANDB_ENTITY.
+unset WANDB_API_KEY || true
+export WANDB_ENTITY="${WANDB_ENTITY:-clara-discovery}"
+# Training runs with the same process count as extraction; --dp-size is kept equal
+# to the number of torchrun processes. Unlike STEP 0/1 this step has no reuse check
+# and runs on every invocation.
+torchrun --nproc_per_node="$NPROC" "${RECIPE_DIR}/scripts/train.py" \
+    --cache-dir "$PARQUET_DIR" \
+    --model-path "$CKPT_DIR" \
+    --layer "$LAYER" \
+    --model-type topk \
+    --expansion-factor 16 --top-k 128 \
+    --normalize-input \
+    --auxk 2048 --auxk-coef 0.03125 \
+    --dead-tokens-threshold 10000000 \
+    --init-pre-bias \
+    --n-epochs 1 \
+    --batch-size 1024 \
+    --lr 1e-4 --lr-schedule cosine --lr-min 1e-5 --warmup-steps 1000 \
+    --max-grad-norm 1.0 \
+    --mix-shards 10 \
+    --dp-size "$NPROC" \
+    --log-interval 100 \
+    --wandb --wandb-project evo2-sae-v2-diverse --wandb-run-name "layer${LAYER}_7B_normalize_input" \
+    --output-dir "$OUTPUT_DIR" \
+    --checkpoint-dir "${OUTPUT_DIR}/checkpoints" \
+    --checkpoint-steps 2000
+
+# NOTE(review): the path below assumes train.py names its last checkpoint
+# checkpoint_final.pt inside --checkpoint-dir; train.py is not visible here.
+echo "============================================================"
+echo "DONE: SAE checkpoint at ${OUTPUT_DIR}/checkpoints/checkpoint_final.pt"
+echo "============================================================"
@@ -0,0 +1,77 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-Apache2
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Chunk a FASTA into <=N-bp windows so predict_evo2 stays inside the model's trained context.
+
+Evo2 1B was trained with seq_length=8192; longer inputs OOM in the Hyena
+fftconv path (intermediates scale super-linearly with L). For 7B/40B raise
+--window to whatever those checkpoints were context-extended to.
+
+Windows are consecutive and never overlap. Each chunk gets a header of the form
+">{orig_id}:{start}-{end}" (0-based start, exclusive end, both in bp of the
+original record) so downstream parquet can be back-mapped.
+"""
+
+import argparse
+import gzip
+from pathlib import Path
+
+
+def parse_fasta(path: Path):
+    """Yield (seq_id, sequence) tuples from a FASTA file (transparently handles .gz).
+
+    Args:
+        path: FASTA file to read. A ".gz" suffix selects gzip decompression;
+            any other suffix is read as plain text.
+
+    Yields:
+        One (seq_id, sequence) tuple per record. seq_id is the first
+        whitespace-delimited token after ">"; sequence is the record's lines
+        joined together with trailing whitespace removed from each line.
+        Lines that appear before the first header are ignored.
+
+    Raises:
+        ValueError: If a header line has no identifier after ">".
+    """
+    opener = gzip.open if path.suffix == ".gz" else open
+    seq_id, parts = None, []
+    with opener(path, "rt") as f:
+        for line_no, line in enumerate(f, start=1):
+            line = line.rstrip()
+            if line.startswith(">"):
+                if seq_id is not None:
+                    yield seq_id, "".join(parts)
+                fields = line[1:].split()
+                if not fields:
+                    # A bare ">" would otherwise surface as an opaque IndexError
+                    # from split()[0]; report the offending file and line instead.
+                    raise ValueError(f"{path}:{line_no}: FASTA header has no sequence id")
+                seq_id = fields[0]
+                parts = []
+            else:
+                parts.append(line)
+        # Flush the final record, which has no following header to trigger it.
+        if seq_id is not None:
+            yield seq_id, "".join(parts)
+
+
+def main():
+    """Read input FASTA, write non-overlapping <=window-bp chunks to output FASTA.
+
+    Command-line arguments:
+        --input:  FASTA to read (required).
+        --output: FASTA to write (required); parent directories are created.
+        --window: maximum chunk length in bp (default 8192, must be > 0).
+
+    Each chunk is written as a two-line record whose header is
+    ">{seq_id}:{start}-{end}" with a 0-based start and exclusive end. Records
+    with an empty sequence produce no chunks.
+
+    The output is written to a temporary sibling file ("<output>.tmp") and
+    renamed onto --output only after every record has been written, so a
+    failed or interrupted run never leaves a truncated file at --output.
+    Callers that skip chunking when the output already exists (as
+    scripts/7b.sh does) depend on this.
+    """
+    p = argparse.ArgumentParser()
+    p.add_argument("--input", type=Path, required=True)
+    p.add_argument("--output", type=Path, required=True)
+    p.add_argument("--window", type=int, default=8192)
+    args = p.parse_args()
+    if args.window <= 0:
+        p.error("--window must be a positive integer")
+    if args.input.resolve() == args.output.resolve():
+        p.error("--input and --output must be different files")
+    tmp_output = args.output.with_name(args.output.name + ".tmp")
+    if args.input.resolve() == tmp_output.resolve():
+        # The temporary file would overwrite the input while it is being read.
+        p.error(f"--input must not be the temporary output path {tmp_output}")
+
+    n_in = n_out = bp_out = 0
+    args.output.parent.mkdir(parents=True, exist_ok=True)
+    try:
+        with open(tmp_output, "w") as out:
+            for seq_id, seq in parse_fasta(args.input):
+                n_in += 1
+                for start in range(0, len(seq), args.window):
+                    end = min(start + args.window, len(seq))
+                    chunk = seq[start:end]
+                    out.write(f">{seq_id}:{start}-{end}\n{chunk}\n")
+                    n_out += 1
+                    bp_out += len(chunk)
+        # Publish the finished file in one step.
+        tmp_output.replace(args.output)
+    except BaseException:
+        # Also covers KeyboardInterrupt: discard the partial file, then re-raise.
+        tmp_output.unlink(missing_ok=True)
+        raise
+
+    print(f"Chunked {n_in} sequences -> {n_out} chunks ({bp_out:,} bp) at window={args.window}")
+
+
+if __name__ == "__main__":
+    main()