Skip to content

Commit 5aae99b

Browse files
committed
transformerless_lm: 3 refinements on v69 ABCD stack
1. Slower K-shrink: 2*T in scheduler lambda. K holds each tier ~2 cycles instead of 1. Addresses v69 cycle-5 K-shrink drop.
2. Bigram saturation threshold F(3)=2 -> F(4)=3. Was over-suppressing intentional repeats like "this happy breed of MEN, this LITTLE world" (legit Shakespeare repetition).
3. Strict word_spacing magnitude eased: 1/phi^pi (0.22) -> 1/phi^2 (0.38). Still encourages spacing but doesn't over-block apostrophe-internal char sequences ('tis, he's).

All three target v69's observed friction points without changing the substrate canon (still F-tier thresholds, phi-bounded magnitudes).
1 parent 610cedc commit 5aae99b

1 file changed

Lines changed: 18 additions & 18 deletions

File tree

experiments/transformerless_lm/train_self_recursive.py

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -743,12 +743,11 @@ def substrate_char_cascade(char_run: int, probs: torch.Tensor,
743743

744744
def substrate_bigram_saturation(prev_tok: int, recent_pairs: list,
745745
probs: torch.Tensor) -> torch.Tensor:
746-
"""Penalize bigram transitions that have already fired F(3)=2+ times
747-
in the last F(7)=13 transitions. Substrate-tier exponential
748-
suppression so a 3rd same-transition fade fast, 4th faster.
749-
750-
Kills 'of this of this of of' bigram-lock loops. Pure substrate
751-
(F-tier counting + phi^pi penalty).
746+
"""Penalize bigram transitions that have already fired F(4)=3+ times
747+
in the last F(7)=13 transitions. Loosened from F(3)=2 (v69) which
748+
over-suppressed legitimate intentional repeats like 'this happy
749+
breed of MEN, this LITTLE world'. Substrate-tier exponential
750+
suppression so a 4th same-transition fades fast.
752751
"""
753752
if not recent_pairs:
754753
return probs
@@ -760,9 +759,10 @@ def substrate_bigram_saturation(prev_tok: int, recent_pairs: list,
760759
if not counts:
761760
return probs
762761
suppress = torch.ones_like(probs)
762+
threshold = F[4]
763763
for next_tok, c in counts.items():
764-
if c >= F[3]:
765-
excess = c - F[3] + 1
764+
if c >= threshold:
765+
excess = c - threshold + 1
766766
tier = min(excess, len(F) - 1)
767767
penalty = 1.0 / (_PHI_FOR_SAMPLING ** (math.pi * F[tier]))
768768
if 0 <= next_tok < probs.shape[0]:
@@ -804,18 +804,16 @@ def substrate_agreement(last_content_ends_s: bool, probs: torch.Tensor,
804804

805805
def substrate_word_spacing(prev_tid: int, probs: torch.Tensor,
806806
vocab: list, n_chars: int = 65) -> torch.Tensor:
807-
"""STRICT word boundary enforcement.
808-
809-
After a word-token (rank >= n_chars), hard-suppress every token
810-
except space, newline, and punctuation. Forces real word
811-
boundaries; eliminates 'kinightmeirface' concat.
807+
"""Word boundary enforcement with gentler suppression magnitude.
812808
813-
Suppression magnitude: 1/phi^pi ~ 0.221.
814-
Pure substrate (rank tier + char-class identification).
809+
After a word-token (rank >= n_chars), suppress every token except
810+
space, newline, and punctuation. Magnitude eased from 1/phi^pi
811+
(v69) to 1/phi^2 ~ 0.382: still strong enough to encourage
812+
spacing but doesn't over-block apostrophe-internal sequences
813+
('tis, he's, etc.).
815814
"""
816815
if prev_tid < n_chars or not vocab:
817816
return probs
818-
# Allowed-after-word: space, newline, common clause punctuation.
819817
allowed_chars = {' ', '\n', '.', ',', '!', '?', ';', ':',
820818
"'", '-'}
821819
allowed_idx = []
@@ -824,7 +822,7 @@ def substrate_word_spacing(prev_tid: int, probs: torch.Tensor,
824822
allowed_idx.append(i)
825823
if not allowed_idx:
826824
return probs
827-
suppress = 1.0 / (_PHI_FOR_SAMPLING ** math.pi)
825+
suppress = 1.0 / (_PHI_FOR_SAMPLING ** 2)
828826
mask = torch.full_like(probs, suppress)
829827
for i in allowed_idx:
830828
mask[i] = 1.0
@@ -2120,7 +2118,9 @@ def train_with_self_distillation(name, train_seed, corpus_anchor, val_split,
21202118
substrate_embed=True,
21212119
)
21222120
optimizer = FibonacciAdamW(model.parameters(), lr=args.lr)
2123-
sched = lambda s, T: K_schedule_tier_walk(s, T, K_init=args.K_init,
2121+
# Slower K-shrink: double T so K decreases at half speed (each tier
2122+
# held ~2 cycles). v69 showed K-shrink-induced drop at cycle 5.
2123+
sched = lambda s, T: K_schedule_tier_walk(s, 2 * T, K_init=args.K_init,
21242124
K_min=args.K_min)
21252125
n_params = sum(p.numel() for p in model.parameters())
21262126

0 commit comments

Comments
 (0)