|
26 | 26 | per-base activation track -> the example cards |
27 | 27 |
|
28 | 28 | The heavy lifting is reused, not reimplemented: ``encode_batch`` (engine) for the |
29 | | -activations and ``sae.analysis.compute_feature_umap`` for the 2-D layout. |
| 29 | +activations and ``sae.analysis.compute_feature_umap`` for the 2-D layout. (Per-feature |
| 30 | +stats are computed locally from the SAE codes rather than via ``compute_feature_stats``, |
| 31 | +which wants raw pre-SAE activations — for long DNA those would not fit in memory.) |
| 32 | +
|
| 33 | +This runs over a SMALL representative corpus (capped by ``--max-sequences``), not the full atlas |
| 34 | +corpus: the 7B pass is here only because the example cards need sequence-aligned |
| 35 | +activations, which the anonymous token-level training cache cannot provide. |
| 36 | +
|
| 37 | +Feature *labels* are not produced here — they come from ``--feature-annotations`` (the |
| 38 | +feature-probing / label-producer pipeline, PR #1630) and are joined into the ``label`` |
| 39 | +column; unlabeled features fall back to ``Feature N``. Users can further rename features in the UI. |
30 | 40 |
|
31 | 41 | Memory is bounded by a two-pass scheme (mirrors the codonfm generator): pass 1 keeps |
32 | 42 | only the per-(sequence, feature) max to pick top examples; pass 2 re-encodes just the |
@@ -59,9 +69,13 @@ def parse_args(): |
59 | 69 | p.add_argument("--layer", type=int, default=int(os.environ.get("EMBEDDING_LAYER", "26"))) |
60 | 70 | p.add_argument("--device", default=os.environ.get("DEVICE", "cuda")) |
61 | 71 | p.add_argument("--max-seq-len", type=int, default=int(os.environ.get("MAX_SEQ_LEN", "8192"))) |
62 | | - # Corpus + output. |
63 | | - p.add_argument("--fasta", required=True, help="FASTA corpus to characterize features over") |
| 72 | + # Corpus + output. This is meant to be a SMALL, representative corpus (a few thousand seqs): |
| 73 | + # we re-run the 7B over it only because the example cards need sequence-aligned activations, |
| 74 | + # which the (anonymous, token-level) training activation cache can't provide. It is NOT the |
| 75 | + # full atlas corpus — stats/UMAP need only a representative sample. |
| 76 | + p.add_argument("--fasta", required=True, help="SMALL representative FASTA (a few thousand seqs)") |
64 | 77 | p.add_argument("--output-dir", required=True, help="Directory to write the 3 parquets into") |
| 78 | + p.add_argument("--max-sequences", type=int, default=4000, help="Cap sequences read from --fasta (keep it small)") |
65 | 79 | p.add_argument("--organism", default="None (raw DNA)", help="Phylo-tag preset to prepend (default: raw DNA)") |
66 | 80 | p.add_argument("--batch-size", type=int, default=8) |
67 | 81 | p.add_argument("--n-examples", type=int, default=6, help="Top examples per feature") |
@@ -230,6 +244,8 @@ def main(): |
230 | 244 |
|
231 | 245 | ids, seqs = [], [] |
232 | 246 | for sid, seq in read_fasta(args.fasta): |
| 247 | + if len(seqs) >= args.max_sequences: |
| 248 | + break |
233 | 249 | ids.append(sid) |
234 | 250 | seqs.append(clean_dna(seq)) |
235 | 251 | if not seqs: |
|
0 commit comments