Skip to content

Commit 9ddc081

Browse files
Merge pull request #3 from RandomCoder-lab/claude/find-claude-md-arn0F
transformerless_lm: cross-sentence subject threading
2 parents 343a950 + 6709e74 commit 9ddc081

43 files changed

Lines changed: 106058 additions & 11 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

experiments/transformerless_lm/activations_substrate.py

Lines changed: 453 additions & 0 deletions
Large diffs are not rendered by default.

experiments/transformerless_lm/corpus.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,13 +46,19 @@ def make_dataset(seq_len: int = 64, source: str = "embedded"):
4646
fast smoke tests and the original tiny-bench)
4747
- "tinyshakespeare": load tinyshakespeare.txt (1.1 MB) — used
4848
by the scale experiment
49+
- "omc": load omc_codebase.txt (~4 MB of OMC source: .py/.rs/.md/.toml).
50+
More diverse than English prose; 210 unique chars.
4951
"""
5052
import os
5153
import torch
5254
if source == "tinyshakespeare":
5355
path = os.path.join(os.path.dirname(__file__), "tinyshakespeare.txt")
5456
with open(path, "r") as f:
5557
text = f.read()
58+
elif source == "omc":
59+
path = os.path.join(os.path.dirname(__file__), "omc_codebase.txt")
60+
with open(path, "r") as f:
61+
text = f.read()
5662
else:
5763
text = CORPUS
5864
chars = sorted(set(text))
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
"""Word-level tokenizer for TinyShakespeare.
2+
3+
The char-level vocab (65 chars) requires the model to learn that
4+
letters form words before it can learn word structure. Word-level
5+
tokenization gives the model atomic semantic units directly — the
6+
model's per-step prediction is a meaningful WORD, not a letter.
7+
8+
Splits on whitespace + punctuation. Keeps punctuation as separate
9+
tokens (so 'ROMEO:' becomes ['romeo', ':']). Alphabetic tokens are lowercased to keep
10+
vocab small.
11+
12+
For TinyShakespeare (1.1 MB) the word vocab is roughly 25K unique
13+
tokens — much larger than 65 chars but each token carries more
14+
semantic weight per step.
15+
"""
16+
17+
import os
18+
import re
19+
20+
import torch
21+
22+
23+
# Token classes, tried in this order: a run of ASCII letters, a run of
# digits, one punctuation/symbol character, a run of newlines, or a run of
# non-newline whitespace. The last class excludes "\n" on purpose: with a
# plain \s+ a space or tab in front of a line break would swallow the
# newline into a mixed token such as " \n".
_TOKEN_PATTERN = re.compile(r"[A-Za-z]+|[0-9]+|[^A-Za-z0-9\s]|\n+|[^\S\n]+")


def tokenize_text(text: str) -> list[str]:
    """Split text into word-like tokens.

    Every character of ``text`` ends up in exactly one token, so joining
    the tokens reproduces the input apart from lowercasing. Runs of
    newlines always form their own tokens (never merged with neighbouring
    spaces or tabs) so the model can learn line structure.

    Args:
        text: Raw text to tokenize.

    Returns:
        Tokens in order of appearance. Purely alphabetic tokens are
        lowercased to shrink the vocabulary; digits, punctuation and
        whitespace tokens are returned unchanged.
    """
    return [
        tok.lower() if tok.isalpha() else tok
        for tok in _TOKEN_PATTERN.findall(text)
    ]
32+
33+
34+
def make_word_dataset(source: str = "tinyshakespeare"):
    """Build a word-level dataset from one of the bundled corpus files.

    The corpus file is looked up next to this module, tokenized with
    ``tokenize_text`` and mapped to integer ids.

    Args:
        source: "tinyshakespeare" (tinyshakespeare.txt) or "omc"
            (omc_codebase.txt).

    Returns:
        A tuple ``(vocab, stoi, itos, encoded)``:
            vocab:   list of unique tokens, sorted
            stoi:    token -> int
            itos:    int -> token
            encoded: 1-D int64 tensor of token ids

    Raises:
        ValueError: if ``source`` is not one of the known corpus names.
    """
    base = os.path.dirname(__file__)
    if source == "tinyshakespeare":
        path = os.path.join(base, "tinyshakespeare.txt")
    elif source == "omc":
        path = os.path.join(base, "omc_codebase.txt")
    else:
        raise ValueError(f"unknown source: {source}")
    # Read with an explicit encoding: without it open() falls back to the
    # locale default, so a corpus with non-ASCII characters could decode
    # differently (or fail) from one machine to the next.
    # NOTE(review): assumes the corpus files are stored as UTF-8 - confirm.
    with open(path, encoding="utf-8") as f:
        text = f.read()
    tokens = tokenize_text(text)
    vocab = sorted(set(tokens))
    stoi = {tok: idx for idx, tok in enumerate(vocab)}
    itos = dict(enumerate(vocab))
    encoded = torch.tensor([stoi[tok] for tok in tokens], dtype=torch.long)
    return vocab, stoi, itos, encoded
57+
58+
59+
def detokenize(token_ids, itos) -> str:
    """Turn a sequence of token ids back into text.

    Each id is converted with ``int()`` and looked up in ``itos``; the
    resulting token strings are concatenated. A single space is inserted
    only between two consecutive alphanumeric tokens; every other token
    (punctuation, whitespace, newlines) is emitted exactly as stored, so
    line structure stays visible in the output.
    """
    pieces = []
    last_was_word = False
    for token_id in token_ids:
        token = itos[int(token_id)]
        is_word = token.isalnum()
        # Two word-like tokens in a row would otherwise run together.
        if is_word and last_was_word:
            pieces.append(" ")
        pieces.append(token)
        last_was_word = is_word
    return "".join(pieces)
78+
79+
80+
def _print_corpus_summary(src):
    """Print token count, vocab size and a 30-token preview for one corpus."""
    vocab, _stoi, itos, enc = make_word_dataset(src)
    print(f"{src}:")
    print(f" total tokens: {enc.numel():,}")
    print(f" unique vocab: {len(vocab):,}")
    sample = detokenize(enc[:30].tolist(), itos)
    print(f" first 30 detok: {sample!r}")
    print()


if __name__ == "__main__":
    # Smoke run: summarize both bundled corpora when executed as a script.
    for src in ("tinyshakespeare", "omc"):
        _print_corpus_summary(src)
Lines changed: 247 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,247 @@
1+
"""Shakespeare-aware creativity scoring.
2+
3+
Replaces val=CE-on-next-token (which only rewards exact reproduction)
4+
with metrics that measure whether GENERATED text is Shakespeare-LIKE
5+
without being identical:
6+
7+
- n-gram overlap: fraction of n-char windows in generated text that
8+
appear ANYWHERE in the corpus. Measures Shakespearean character
9+
patterns without exact-word requirement.
10+
- vocab overlap: fraction of generated tokens (whitespace-separated)
11+
that match corpus vocabulary. Real English/Shakespeare words even
12+
if not in the same sentence.
13+
- line structure: avg line length, ratio of letters to total chars.
14+
Captures stanza/line-break patterns.
15+
- vowel-consonant transition rate: English alternates v/c; random
16+
text doesn't. Score the alternation pattern.
17+
18+
Use these to evaluate creative output of substrate-aligned model. A
19+
model that produces statistically-Shakespearean GIBBERISH gets ~0;
20+
a model that produces creative but recognizable English gets high.
21+
"""
22+
23+
import string
24+
from collections import Counter
25+
26+
27+
28+
29+
# Character classes used by the scoring helpers in this module.
# Vowels in both cases; 'y' is not included, so it counts as a consonant.
VOWELS = set("aeiouAEIOU")
# ASCII letters only (a-z, A-Z).
LETTERS = set(string.ascii_letters)
# Space, newline and tab.
# NOTE(review): not referenced by any function visible here - confirm it is still needed.
WHITESPACE = set(" \n\t")
32+
33+
34+
def char_ngram_overlap(generated: str, corpus_text: str, n: int) -> float:
    """Fraction of n-char windows in ``generated`` that occur in the corpus.

    Higher = more corpus-like character patterns.

    Args:
        generated: Text to score.
        corpus_text: Reference text whose n-grams define what counts as a
            match.
        n: Window width in characters.

    Returns:
        A value in [0, 1]. Returns 0.0 when ``generated`` is shorter than
        ``n`` or when ``n`` is not positive (a zero-width window would
        match trivially and a negative width yields meaningless slices).
    """
    if n < 1 or len(generated) < n:
        return 0.0
    corpus_ngrams = {
        corpus_text[i:i + n] for i in range(len(corpus_text) - n + 1)
    }
    # len(generated) >= n >= 1 here, so there is always at least one window.
    window_count = len(generated) - n + 1
    matches = sum(
        1 for i in range(window_count) if generated[i:i + n] in corpus_ngrams
    )
    return matches / window_count
45+
46+
47+
def vocab_overlap(generated: str, corpus_text: str) -> float:
    """Share of generated words that also occur in the corpus.

    Both texts are split on whitespace; each piece is lowercased and has
    leading/trailing punctuation stripped, and pieces that end up empty
    are ignored. Returns 0.0 when ``generated`` has no words left.
    """
    def normalized_words(text):
        stripped = (raw.lower().strip(string.punctuation) for raw in text.split())
        return [word for word in stripped if word]

    candidates = normalized_words(generated)
    if not candidates:
        return 0.0
    known = set(normalized_words(corpus_text))
    hits = sum(1 for word in candidates if word in known)
    return hits / len(candidates)
58+
59+
60+
def line_structure_stats(generated: str) -> dict:
    """Summarize the lengths of the non-blank lines in ``generated``.

    Lines are obtained by splitting on "\\n"; lines that are empty or
    whitespace-only are skipped. Returns a dict with the keys "n_lines",
    "mean_line_len" and "std_line_len" (population standard deviation),
    all zero when no non-blank line exists.
    """
    lengths = [len(line) for line in generated.split("\n") if line.strip()]
    if not lengths:
        return {"n_lines": 0, "mean_line_len": 0.0, "std_line_len": 0.0}
    count = len(lengths)
    mean = sum(lengths) / count
    variance = sum((length - mean) ** 2 for length in lengths) / count
    return {
        "n_lines": count,
        "mean_line_len": mean,
        "std_line_len": variance ** 0.5,
    }
73+
74+
75+
def vc_alternation_rate(generated: str) -> float:
    """Rate at which adjacent letters switch between vowel and consonant.

    Only characters in LETTERS are considered; everything else is dropped
    before pairing, so letters separated by spaces or punctuation still
    count as adjacent. Returns the fraction of adjacent letter pairs where
    exactly one of the two is in VOWELS, or 0.0 when there are fewer than
    two letters.
    """
    vowel_flags = [ch in VOWELS for ch in generated if ch in LETTERS]
    pair_count = len(vowel_flags) - 1
    if pair_count < 1:
        return 0.0
    switches = sum(
        1 for cur, nxt in zip(vowel_flags, vowel_flags[1:]) if cur != nxt
    )
    return switches / pair_count
88+
89+
90+
def line_length_match(generated: str, corpus_text: str) -> float:
    """L1 distance between the line-length histograms of two texts.

    For each text, non-blank lines (split on "\\n") are bucketed by length,
    with every length of 80 or more sharing the last bucket, and the
    buckets are normalized to fractions. The result is the sum of absolute
    bucket differences: 0.0 for identical distributions, larger values for
    more dissimilar ones. A text without non-blank lines contributes an
    all-zero histogram.
    """
    def length_histogram(text, max_len=80):
        buckets = [0] * (max_len + 1)
        for line in text.split("\n"):
            if line.strip():
                buckets[min(len(line), max_len)] += 1
        # Avoid dividing by zero when the text has no non-blank lines.
        denom = sum(buckets) or 1
        return [count / denom for count in buckets]

    generated_hist = length_histogram(generated)
    corpus_hist = length_histogram(corpus_text)
    return sum(abs(g - c) for g, c in zip(generated_hist, corpus_hist))
105+
106+
107+
def real_word_fraction(generated: str, corpus_text: str,
                       min_word_len: int = 3) -> float:
    """Share of the longer generated words that occur in the corpus.

    Words are whitespace-split pieces, lowercased, with leading/trailing
    punctuation stripped. Generated words shorter than ``min_word_len``
    are left out of both numerator and denominator. Returns 0.0 when no
    generated word is long enough.
    """
    def words_of(text):
        cleaned = (raw.lower().strip(string.punctuation) for raw in text.split())
        return [word for word in cleaned if word]

    candidates = [w for w in words_of(generated) if len(w) >= min_word_len]
    if not candidates:
        return 0.0
    known = set(words_of(corpus_text))
    hits = sum(1 for word in candidates if word in known)
    return hits / len(candidates)
123+
124+
125+
def common_word_presence(generated: str, corpus_text: str,
                         top_k: int = 50) -> float:
    """Coverage of the corpus's most frequent words by the generated text.

    The ``top_k`` most common corpus words are determined (words are
    whitespace-split pieces, lowercased, with leading/trailing punctuation
    stripped). The result is the fraction of those words that appear at
    least once in ``generated``, or 0.0 when the corpus yields no words.
    """
    def words_of(text):
        cleaned = (raw.lower().strip(string.punctuation) for raw in text.split())
        return [word for word in cleaned if word]

    frequencies = Counter(words_of(corpus_text))
    most_common = {word for word, _count in frequencies.most_common(top_k)}
    if not most_common:
        return 0.0
    seen = set(words_of(generated))
    return len(seen & most_common) / len(most_common)
142+
143+
144+
def avg_word_length_match(generated: str, corpus_text: str) -> float:
    """Closeness of the generated mean word length to the corpus mean.

    Computes ``1 - |g - c| / c`` where ``g`` and ``c`` are the mean word
    lengths of the generated and corpus text, floored at 0.0. Words are
    whitespace-split pieces, lowercased, with leading/trailing punctuation
    stripped. Returns 0.0 when the corpus has no words.
    """
    def mean_word_length(text):
        cleaned = (raw.lower().strip(string.punctuation) for raw in text.split())
        lengths = [len(word) for word in cleaned if word]
        if not lengths:
            return 0.0
        return sum(lengths) / len(lengths)

    corpus_mean = mean_word_length(corpus_text)
    if corpus_mean == 0:
        return 0.0
    generated_mean = mean_word_length(generated)
    relative_gap = abs(generated_mean - corpus_mean) / corpus_mean
    return max(0.0, 1.0 - relative_gap)
156+
157+
158+
def ngram_diversity(generated: str, n: int = 3) -> float:
    """Ratio of distinct character n-grams to total n-gram windows.

    1.0 means every window is different (maximum diversity); the value
    approaches 0 as the same n-gram is repeated more often (with k windows
    that are all identical the result is 1/k). Returns 0.0 when
    ``generated`` is shorter than ``n``. Intended as a counterweight to
    overlap metrics that can be gamed by repetition.
    """
    if len(generated) < n:
        return 0.0
    window_count = len(generated) - n + 1
    distinct = {generated[i:i + n] for i in range(window_count)}
    return len(distinct) / window_count
169+
170+
171+
def repetition_penalty(generated: str, n: int = 4,
                       max_freq_threshold: int = 3) -> float:
    """Penalty in [0, 1] for character n-grams that repeat too often.

    Every n-gram occurring more than ``max_freq_threshold`` times adds its
    excess count to a running total; the total is divided by the number
    of n-gram windows and capped at 1.0. 0.0 means no n-gram exceeded the
    threshold (or ``generated`` is shorter than ``n``). Targets the
    'fan fan, fan, fan' failure mode.
    """
    if len(generated) < n:
        return 0.0
    window_count = len(generated) - n + 1
    tally = Counter(generated[i:i + n] for i in range(window_count))
    surplus = 0
    for occurrences in tally.values():
        if occurrences > max_freq_threshold:
            surplus += occurrences - max_freq_threshold
    # Scale by the number of windows so longer texts are not penalized more.
    return min(1.0, surplus / max(1, window_count))
186+
187+
188+
def lexical_diversity(generated: str) -> float:
    """Type-token ratio of the words in ``generated``.

    Words are whitespace-split pieces, lowercased, with leading/trailing
    punctuation stripped; empty results are dropped. Returns the number of
    distinct words divided by the total number of words (higher = more
    varied vocabulary), or 0.0 when there are no words.
    """
    from string import punctuation

    cleaned = (raw.lower().strip(punctuation) for raw in generated.split())
    tokens = [word for word in cleaned if word]
    if not tokens:
        return 0.0
    return len(set(tokens)) / len(tokens)
197+
198+
199+
def creativity_score(generated: str, corpus_text: str) -> dict:
    """Score how corpus-like ``generated`` is, with anti-gibberish terms.

    Computes the individual metrics and blends them into one composite:

        0.25 * real_word_fraction      (words of length >= 3)
      + 0.15 * common_word_presence    (top 50 corpus words)
      + 0.15 * vocab_overlap
      + 0.10 * avg_word_length_match
      + 0.15 * 3-char n-gram overlap
      + 0.10 * 4-char n-gram overlap
      + 0.10 * max(0, 1 - line_length_match)
      - 0.30 * repetition_penalty      (4-char n-grams)

    The composite is clamped to [0, 1]. The 2-char n-gram overlap, the
    vowel/consonant alternation rate and the line statistics are reported
    in the result but do not enter the composite.

    Returns:
        dict with the keys "ngram_2", "ngram_3", "ngram_4",
        "vocab_overlap", "common_word_presence", "real_word_fraction",
        "avg_word_len_match", "vc_alternation", "line_dist", "line_stats",
        "repetition_penalty" and "creativity_score".
    """
    # Character-pattern overlap at three window widths.
    overlap_2 = char_ngram_overlap(generated, corpus_text, 2)
    overlap_3 = char_ngram_overlap(generated, corpus_text, 3)
    overlap_4 = char_ngram_overlap(generated, corpus_text, 4)

    # Word-level signals (the main guard against gibberish).
    vocab_share = vocab_overlap(generated, corpus_text)
    common_share = common_word_presence(generated, corpus_text, top_k=50)
    real_share = real_word_fraction(generated, corpus_text, min_word_len=3)
    word_len_fit = avg_word_length_match(generated, corpus_text)

    # Shape signals.
    alternation = vc_alternation_rate(generated)
    line_distance = line_length_match(generated, corpus_text)
    line_summary = line_structure_stats(generated)

    # The allowed repeat count grows with text length so the natural
    # repetition of longer real text is not penalized.
    allowed_repeats = max(2, len(generated) // 50)
    repetition = repetition_penalty(
        generated, n=4, max_freq_threshold=allowed_repeats
    )

    line_fit = max(0.0, 1.0 - line_distance)
    # Keep this exact left-to-right addition order: it determines the
    # floating-point result.
    blended = (
        0.25 * real_share
        + 0.15 * common_share
        + 0.15 * vocab_share
        + 0.10 * word_len_fit
        + 0.15 * overlap_3
        + 0.10 * overlap_4
        + 0.10 * line_fit
    ) - 0.3 * repetition
    blended = max(0.0, min(1.0, blended))

    return {
        "ngram_2": overlap_2,
        "ngram_3": overlap_3,
        "ngram_4": overlap_4,
        "vocab_overlap": vocab_share,
        "common_word_presence": common_share,
        "real_word_fraction": real_share,
        "avg_word_len_match": word_len_fit,
        "vc_alternation": alternation,
        "line_dist": line_distance,
        "line_stats": line_summary,
        "repetition_penalty": repetition,
        "creativity_score": blended,
    }

0 commit comments

Comments
 (0)