|
| 1 | +{ |
| 2 | + "seeds": { |
| 3 | + "ecoli_16s": { |
| 4 | + "name": "E. coli 16S rRNA — A1408 region", |
| 5 | + "description": "Bacterial 16S ribosomal RNA; position 1408 is the known aminoglycoside-resistance site (the A1408G mutation confers kanamycin resistance).", |
| 6 | + "sequence": "GGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGTTGGATCACCTCCTTACCG", |
| 7 | + "mask_position": 32 |
| 8 | + }, |
| 9 | + "promoter": { |
| 10 | + "name": "E. coli σ70 promoter (−10 TATAAT region)", |
| 11 | + "description": "Synthetic bacterial promoter; the masked position sits inside the −10 TATAAT consensus.", |
| 12 | + "sequence": "GCAATTGACAAGTAACCGAGCATTAGCTATAATGTGATAGCTCAGATGAGCCATGCGGT", |
| 13 | + "mask_position": 30 |
| 14 | + }, |
| 15 | + "brca1_exon": { |
| 16 | + "name": "BRCA1 exon 2 fragment (start of coding sequence)", |
| 17 | + "description": "Human BRCA1 coding region. Masked position is the third base of a codon — the position where the model's choice maps to amino-acid identity.", |
| 18 | + "sequence": "ATGGATTTATCTGCTCTTCGCGTTGAAGAAGTACAAAATGTCATTAATGCTATGCAGAA", |
| 19 | + "mask_position": 28 |
| 20 | + }, |
| 21 | + "random": { |
| 22 | + "name": "Random control sequence", |
| 23 | + "description": "Uniformly-sampled ACGT, no biological structure. Steering should produce minimal shifts here.", |
| 24 | + "sequence": "GTACCATGCAGTTCGAACTGCATGCTAGCATAGCTACGATCGTACGATCGATCGATCGA", |
| 25 | + "mask_position": 30 |
| 26 | + } |
| 27 | + }, |
| 28 | + "comparisons": { |
| 29 | + "ecoli_16s__kanamycin_resistance": { |
| 30 | + "feature_id": 12, |
| 31 | + "feature_label": "kanamycin_resistance", |
| 32 | + "suppress": {"p_base": {"A": 0.85, "C": 0.05, "G": 0.04, "T": 0.06}, "feature_activation": 0.0, "top_base": "A"}, |
| 33 | + "baseline": {"p_base": {"A": 0.74, "C": 0.12, "G": 0.08, "T": 0.06}, "feature_activation": 1.8, "top_base": "A"}, |
| 34 | + "amplify": {"p_base": {"A": 0.18, "C": 0.07, "G": 0.71, "T": 0.04}, "feature_activation": 4.7, "top_base": "G"}, |
| 35 | + "effect_size": 0.63, |
| 36 | + "narrative": "Matches the known A1408G aminoglycoside-resistance mutation. Amplifying the kanamycin-resistance feature pushes the model from the wild-type A toward the resistance-conferring G." |
| 37 | + }, |
| 38 | + "ecoli_16s__rRNA_structural": { |
| 39 | + "feature_id": 18, |
| 40 | + "feature_label": "rRNA_structural", |
| 41 | + "suppress": {"p_base": {"A": 0.42, "C": 0.21, "G": 0.21, "T": 0.16}, "feature_activation": 0.0, "top_base": "A"}, |
| 42 | + "baseline": {"p_base": {"A": 0.74, "C": 0.12, "G": 0.08, "T": 0.06}, "feature_activation": 2.1, "top_base": "A"}, |
| 43 | + "amplify": {"p_base": {"A": 0.91, "C": 0.04, "G": 0.03, "T": 0.02}, "feature_activation": 5.2, "top_base": "A"}, |
| 44 | + "effect_size": 0.17, |
| 45 | + "narrative": "Amplifying the rRNA structural-context feature reinforces the wild-type A; suppressing it flattens toward uniform. The feature acts as a confidence multiplier on the natural base." |
| 46 | + }, |
| 47 | + "promoter__TATA_box": { |
| 48 | + "feature_id": 4, |
| 49 | + "feature_label": "TATA_box", |
| 50 | + "suppress": {"p_base": {"A": 0.28, "C": 0.24, "G": 0.23, "T": 0.25}, "feature_activation": 0.0, "top_base": "A"}, |
| 51 | + "baseline": {"p_base": {"A": 0.62, "C": 0.10, "G": 0.07, "T": 0.21}, "feature_activation": 1.5, "top_base": "A"}, |
| 52 | + "amplify": {"p_base": {"A": 0.79, "C": 0.04, "G": 0.03, "T": 0.14}, "feature_activation": 3.8, "top_base": "A"}, |
| 53 | + "effect_size": 0.17, |
| 54 | + "narrative": "Amplifying the TATA-box feature reinforces the A at position 4 of the TATAAT motif; suppressing it collapses the distribution toward uniform — the model loses confidence in the consensus." |
| 55 | + }, |
| 56 | + "promoter__exon_start": { |
| 57 | + "feature_id": 7, |
| 58 | + "feature_label": "exon_start", |
| 59 | + "suppress": {"p_base": {"A": 0.55, "C": 0.16, "G": 0.12, "T": 0.17}, "feature_activation": 0.0, "top_base": "A"}, |
| 60 | + "baseline": {"p_base": {"A": 0.62, "C": 0.10, "G": 0.07, "T": 0.21}, "feature_activation": 0.4, "top_base": "A"}, |
| 61 | + "amplify": {"p_base": {"A": 0.60, "C": 0.11, "G": 0.08, "T": 0.21}, "feature_activation": 1.1, "top_base": "A"}, |
| 62 | + "effect_size": 0.07, |
| 63 | + "narrative": "Null result: the exon_start feature has minimal activity in a bacterial promoter context. Probabilities barely shift across the three interventions." |
| 64 | + }, |
| 65 | + "brca1_exon__alpha_helix": { |
| 66 | + "feature_id": 0, |
| 67 | + "feature_label": "alpha_helix", |
| 68 | + "suppress": {"p_base": {"A": 0.18, "C": 0.31, "G": 0.39, "T": 0.12}, "feature_activation": 0.0, "top_base": "G"}, |
| 69 | + "baseline": {"p_base": {"A": 0.08, "C": 0.22, "G": 0.58, "T": 0.12}, "feature_activation": 2.3, "top_base": "G"}, |
| 70 | + "amplify": {"p_base": {"A": 0.03, "C": 0.10, "G": 0.83, "T": 0.04}, "feature_activation": 5.1, "top_base": "G"}, |
| 71 | + "effect_size": 0.25, |
| 72 | + "narrative": "Amplifying the α-helix feature in a BRCA1 codon-3 position increases preference for G — helix-favoring codons such as GAG (glutamate) and GCC (alanine) end in G or C. Suppressing flattens the codon-bias signal." |
| 73 | + }, |
| 74 | + "brca1_exon__beta_sheet": { |
| 75 | + "feature_id": 1, |
| 76 | + "feature_label": "beta_sheet", |
| 77 | + "suppress": {"p_base": {"A": 0.10, "C": 0.20, "G": 0.55, "T": 0.15}, "feature_activation": 0.0, "top_base": "G"}, |
| 78 | + "baseline": {"p_base": {"A": 0.08, "C": 0.22, "G": 0.58, "T": 0.12}, "feature_activation": 0.7, "top_base": "G"}, |
| 79 | + "amplify": {"p_base": {"A": 0.06, "C": 0.31, "G": 0.49, "T": 0.14}, "feature_activation": 2.0, "top_base": "G"}, |
| 80 | + "effect_size": 0.09, |
| 81 | + "narrative": "Mild effect: β-sheet propensity nudges the third codon position slightly toward C (β-sheet residues like Val/Ile use codons ending in C/T), but the structural context isn't strong enough to flip the top base." |
| 82 | + }, |
| 83 | + "brca1_exon__kanamycin_resistance": { |
| 84 | + "feature_id": 12, |
| 85 | + "feature_label": "kanamycin_resistance", |
| 86 | + "suppress": {"p_base": {"A": 0.07, "C": 0.22, "G": 0.59, "T": 0.12}, "feature_activation": 0.0, "top_base": "G"}, |
| 87 | + "baseline": {"p_base": {"A": 0.08, "C": 0.22, "G": 0.58, "T": 0.12}, "feature_activation": 0.1, "top_base": "G"}, |
| 88 | + "amplify": {"p_base": {"A": 0.09, "C": 0.21, "G": 0.58, "T": 0.12}, "feature_activation": 0.4, "top_base": "G"}, |
| 89 | + "effect_size": 0.01, |
| 90 | + "narrative": "Null result: kanamycin_resistance is a bacterial-rRNA-specific feature and has essentially no activity in a human exonic context. The intervention is rejected by the surrounding sequence." |
| 91 | + }, |
| 92 | + "random__TATA_box": { |
| 93 | + "feature_id": 4, |
| 94 | + "feature_label": "TATA_box", |
| 95 | + "suppress": {"p_base": {"A": 0.24, "C": 0.27, "G": 0.26, "T": 0.23}, "feature_activation": 0.0, "top_base": "C"}, |
| 96 | + "baseline": {"p_base": {"A": 0.25, "C": 0.25, "G": 0.25, "T": 0.25}, "feature_activation": 0.2, "top_base": "A"}, |
| 97 | + "amplify": {"p_base": {"A": 0.41, "C": 0.21, "G": 0.18, "T": 0.20}, "feature_activation": 1.3, "top_base": "A"}, |
| 98 | + "effect_size": 0.16, |
| 99 | + "narrative": "Modest effect on random sequence: amplifying TATA_box biases toward A (which, together with T, makes up the entire TATAAT motif), but the surrounding context has no TATA motif so the lift is small." |
| 100 | + }, |
| 101 | + "random__kanamycin_resistance": { |
| 102 | + "feature_id": 12, |
| 103 | + "feature_label": "kanamycin_resistance", |
| 104 | + "suppress": {"p_base": {"A": 0.25, "C": 0.25, "G": 0.25, "T": 0.25}, "feature_activation": 0.0, "top_base": "A"}, |
| 105 | + "baseline": {"p_base": {"A": 0.26, "C": 0.24, "G": 0.25, "T": 0.25}, "feature_activation": 0.0, "top_base": "A"}, |
| 106 | + "amplify": {"p_base": {"A": 0.27, "C": 0.24, "G": 0.25, "T": 0.24}, "feature_activation": 0.3, "top_base": "A"}, |
| 107 | + "effect_size": 0.02, |
| 108 | + "narrative": "Null result: in a non-rRNA random control, the kanamycin_resistance feature doesn't fire and forcing it doesn't propagate. Demonstrates the SAE doesn't blindly comply with steering when the input doesn't support the feature." |
| 109 | + }, |
| 110 | + "promoter__rRNA_structural": { |
| 111 | + "feature_id": 18, |
| 112 | + "feature_label": "rRNA_structural", |
| 113 | + "suppress": {"p_base": {"A": 0.60, "C": 0.12, "G": 0.07, "T": 0.21}, "feature_activation": 0.0, "top_base": "A"}, |
| 114 | + "baseline": {"p_base": {"A": 0.62, "C": 0.10, "G": 0.07, "T": 0.21}, "feature_activation": 0.3, "top_base": "A"}, |
| 115 | + "amplify": {"p_base": {"A": 0.65, "C": 0.10, "G": 0.06, "T": 0.19}, "feature_activation": 1.5, "top_base": "A"}, |
| 116 | + "effect_size": 0.03, |
| 117 | + "narrative": "Null result: the rRNA_structural feature has minimal activity in a DNA promoter context. Probabilities essentially unchanged." |
| 118 | + }, |
| 119 | + "ecoli_16s__TATA_box": { |
| 120 | + "feature_id": 4, |
| 121 | + "feature_label": "TATA_box", |
| 122 | + "suppress": {"p_base": {"A": 0.78, "C": 0.10, "G": 0.07, "T": 0.05}, "feature_activation": 0.0, "top_base": "A"}, |
| 123 | + "baseline": {"p_base": {"A": 0.74, "C": 0.12, "G": 0.08, "T": 0.06}, "feature_activation": 0.4, "top_base": "A"}, |
| 124 | + "amplify": {"p_base": {"A": 0.71, "C": 0.13, "G": 0.08, "T": 0.08}, "feature_activation": 1.2, "top_base": "A"}, |
| 125 | + "effect_size": 0.04, |
| 126 | + "narrative": "Subtle: TATA_box is a eukaryotic promoter motif and has limited activity in a bacterial rRNA context. Small lift in T probability but not enough to compete with the wild-type A." |
| 127 | + }, |
| 128 | + "ecoli_16s__alpha_helix": { |
| 129 | + "feature_id": 0, |
| 130 | + "feature_label": "alpha_helix", |
| 131 | + "suppress": {"p_base": {"A": 0.73, "C": 0.12, "G": 0.09, "T": 0.06}, "feature_activation": 0.0, "top_base": "A"}, |
| 132 | + "baseline": {"p_base": {"A": 0.74, "C": 0.12, "G": 0.08, "T": 0.06}, "feature_activation": 0.1, "top_base": "A"}, |
| 133 | + "amplify": {"p_base": {"A": 0.72, "C": 0.13, "G": 0.09, "T": 0.06}, "feature_activation": 0.5, "top_base": "A"}, |
| 134 | + "effect_size": 0.02, |
| 135 | + "narrative": "Null result: α-helix is a protein-coding feature, not relevant in a non-coding rRNA context. The model rejects the steering signal." |
| 136 | + }, |
| 137 | + "brca1_exon__exon_start": { |
| 138 | + "feature_id": 7, |
| 139 | + "feature_label": "exon_start", |
| 140 | + "suppress": {"p_base": {"A": 0.12, "C": 0.22, "G": 0.48, "T": 0.18}, "feature_activation": 0.0, "top_base": "G"}, |
| 141 | + "baseline": {"p_base": {"A": 0.08, "C": 0.22, "G": 0.58, "T": 0.12}, "feature_activation": 1.6, "top_base": "G"}, |
| 142 | + "amplify": {"p_base": {"A": 0.04, "C": 0.18, "G": 0.71, "T": 0.07}, "feature_activation": 4.2, "top_base": "G"}, |
| 143 | + "effect_size": 0.13, |
| 144 | + "narrative": "Amplifying exon_start sharpens the model's preference for the canonical G at this codon position; suppressing it flattens toward uniform exon-like noise." |
| 145 | + }, |
| 146 | + "promoter__alpha_helix": { |
| 147 | + "feature_id": 0, |
| 148 | + "feature_label": "alpha_helix", |
| 149 | + "suppress": {"p_base": {"A": 0.63, "C": 0.09, "G": 0.07, "T": 0.21}, "feature_activation": 0.0, "top_base": "A"}, |
| 150 | + "baseline": {"p_base": {"A": 0.62, "C": 0.10, "G": 0.07, "T": 0.21}, "feature_activation": 0.0, "top_base": "A"}, |
| 151 | + "amplify": {"p_base": {"A": 0.61, "C": 0.10, "G": 0.08, "T": 0.21}, "feature_activation": 0.3, "top_base": "A"}, |
| 152 | + "effect_size": 0.02, |
| 153 | + "narrative": "Null result: α-helix is protein-coding; promoters aren't coding regions. The feature doesn't fire and steering has no traction." |
| 154 | + } |
| 155 | + } |
| 156 | +} |
0 commit comments