@@ -743,12 +743,11 @@ def substrate_char_cascade(char_run: int, probs: torch.Tensor,
743743
744744def substrate_bigram_saturation (prev_tok : int , recent_pairs : list ,
745745 probs : torch .Tensor ) -> torch .Tensor :
746- """Penalize bigram transitions that have already fired F(3)=2+ times
747- in the last F(7)=13 transitions. Substrate-tier exponential
748- suppression so a 3rd same-transition fade fast, 4th faster.
749-
750- Kills 'of this of this of of' bigram-lock loops. Pure substrate
751- (F-tier counting + phi^pi penalty).
746+ """Penalize bigram transitions that have already fired F(4)=3+ times
747+ in the last F(7)=13 transitions. Loosened from F(3)=2 (v69) which
748+ over-suppressed legitimate intentional repeats like 'this happy
749+ breed of MEN, this LITTLE world'. Substrate-tier exponential
750+ suppression so a 4th same-transition fades fast.
752751 """
753752 if not recent_pairs :
754753 return probs
@@ -760,9 +759,10 @@ def substrate_bigram_saturation(prev_tok: int, recent_pairs: list,
760759 if not counts :
761760 return probs
762761 suppress = torch .ones_like (probs )
762+ threshold = F [4 ]
763763 for next_tok , c in counts .items ():
764- if c >= F [ 3 ] :
765- excess = c - F [ 3 ] + 1
764+ if c >= threshold :
765+ excess = c - threshold + 1
766766 tier = min (excess , len (F ) - 1 )
767767 penalty = 1.0 / (_PHI_FOR_SAMPLING ** (math .pi * F [tier ]))
768768 if 0 <= next_tok < probs .shape [0 ]:
@@ -804,18 +804,16 @@ def substrate_agreement(last_content_ends_s: bool, probs: torch.Tensor,
804804
805805def substrate_word_spacing (prev_tid : int , probs : torch .Tensor ,
806806 vocab : list , n_chars : int = 65 ) -> torch .Tensor :
807- """STRICT word boundary enforcement.
808-
809- After a word-token (rank >= n_chars), hard-suppress every token
810- except space, newline, and punctuation. Forces real word
811- boundaries; eliminates 'kinightmeirface' concat.
807+ """Word boundary enforcement with gentler suppression magnitude.
812808
813- Suppression magnitude: 1/phi^pi ~ 0.221.
814- Pure substrate (rank tier + char-class identification).
809+ After a word-token (rank >= n_chars), suppress every token except
810+ space, newline, and punctuation. Magnitude eased from 1/phi^pi
811+ (v69) to 1/phi^2 ~ 0.382: still strong enough to encourage
812+ spacing but doesn't over-block apostrophe-internal sequences
813+ ('tis, he's, etc.).
815814 """
816815 if prev_tid < n_chars or not vocab :
817816 return probs
818- # Allowed-after-word: space, newline, common clause punctuation.
819817 allowed_chars = {' ' , '\n ' , '.' , ',' , '!' , '?' , ';' , ':' ,
820818 "'" , '-' }
821819 allowed_idx = []
@@ -824,7 +822,7 @@ def substrate_word_spacing(prev_tid: int, probs: torch.Tensor,
824822 allowed_idx .append (i )
825823 if not allowed_idx :
826824 return probs
827- suppress = 1.0 / (_PHI_FOR_SAMPLING ** math . pi )
825+ suppress = 1.0 / (_PHI_FOR_SAMPLING ** 2 )
828826 mask = torch .full_like (probs , suppress )
829827 for i in allowed_idx :
830828 mask [i ] = 1.0
@@ -2120,7 +2118,9 @@ def train_with_self_distillation(name, train_seed, corpus_anchor, val_split,
21202118 substrate_embed = True ,
21212119 )
21222120 optimizer = FibonacciAdamW (model .parameters (), lr = args .lr )
2123- sched = lambda s , T : K_schedule_tier_walk (s , T , K_init = args .K_init ,
2121+ # Slower K-shrink: double T so K decreases at half speed (each tier
2122+ # held ~2 cycles). v69 showed K-shrink-induced drop at cycle 5.
2123+ sched = lambda s , T : K_schedule_tier_walk (s , 2 * T , K_init = args .K_init ,
21242124 K_min = args .K_min )
21252125 n_params = sum (p .numel () for p in model .parameters ())
21262126
0 commit comments