@@ -702,11 +702,45 @@ def build_unpronounceable_mask(vocab: list) -> torch.Tensor:
702702 break
703703 n_vowel = sum (1 for c in tok if c in _IAMBIC_VOWELS )
704704 all_consonant = (n_vowel == 0 ) and (len (tok ) > F [3 ])
705- if max_cluster > F [5 ] or triple or all_consonant :
705+ # Long words with very low vowel ratio: 6+ chars, < 1/phi^3 ~ 0.236.
706+ # Eases past legit proper nouns ('northumberland' 0.29,
707+ # 'buckingham' 0.30) but flags consonant-soup tokens.
708+ low_vowel_long = (
709+ len (tok ) > F [5 ]
710+ and (n_vowel / len (tok )) < (1.0 / (_PHI_FOR_SAMPLING ** 3 ))
711+ )
712+ if max_cluster > F [5 ] or triple or all_consonant or low_vowel_long :
706713 mask [i ] = 1.0
707714 return mask
708715
709716
def substrate_char_cascade(char_run: int, probs: torch.Tensor,
                           n_chars: int, exempt_ids=None) -> torch.Tensor:
    """Anti-char-cascade: damp raw-char tokens after a run of them.

    Once ``char_run`` reaches the threshold ``_FIB_NUMS_FOR_BIGRAM[3]``,
    the probability of every token id below ``n_chars`` is multiplied by
    a penalty and the distribution is renormalised. This targets
    sampling-time artifacts like 'thouA', 'drinesa', 'mensFDoroyali',
    where raw chars are strung together after a word without spacing.

    Args:
        char_run: number of consecutive char tokens emitted so far.
        probs: probabilities indexed by token id along the first axis.
            Returned unchanged (same object, not a copy) when no
            suppression applies.
        n_chars: token ids in ``[0, n_chars)`` are treated as char tokens.
        exempt_ids: optional iterable of token ids (e.g. the ids of space
            and newline) that keep their original probability. Ids
            outside ``[0, n_chars)`` are ignored.

    Returns:
        ``probs`` itself when below threshold, when ``n_chars <= 0`` or
        when ``n_chars`` covers the whole first axis; otherwise a
        renormalised copy with the char range damped.

    NOTE: with ``exempt_ids=None`` (the default) *every* id below
    ``n_chars`` is damped, including space and newline if they live in
    that range; this function has no vocab and cannot identify them on
    its own. Callers that want those tokens spared must pass their ids.
    """
    threshold = _FIB_NUMS_FOR_BIGRAM[3]
    if char_run < threshold or n_chars <= 0:
        return probs
    # Nothing but char tokens in the distribution: damping all of them
    # and renormalising would be a no-op, so skip the work.
    if n_chars >= probs.shape[0]:
        return probs

    # Tier grows with every char beyond the threshold, capped at the last
    # available table entry so long runs cannot index out of range.
    excess = char_run - threshold + 1
    tier = min(excess, len(_FIB_NUMS_FOR_BIGRAM) - 1)
    penalty = 1.0 / (
        _PHI_FOR_SAMPLING ** (math.pi * _FIB_NUMS_FOR_BIGRAM[tier])
    )

    out = probs.clone()
    out[:n_chars] = out[:n_chars] * penalty
    if exempt_ids is not None:
        # Restore the pre-penalty mass for explicitly exempted char ids.
        for tid in exempt_ids:
            if 0 <= tid < n_chars:
                out[tid] = probs[tid]
    # Epsilon guards against division by zero on an all-zero input.
    return out / (out.sum() + 1e-8)
742+
743+
710744def substrate_word_spacing (prev_tid : int , probs : torch .Tensor ,
711745 vocab : list , n_chars : int = 65 ) -> torch .Tensor :
712746 """After a word-token (rank >= n_chars), boost the space token to
@@ -1226,6 +1260,7 @@ def autoregressive_generate(model, prompt: torch.Tensor, n_new: int,
12261260 syl_pos = 0
12271261 open_needs = 0
12281262 cluster_len = 0
1263+ char_run = 0
12291264 if vocab is not None :
12301265 for tid in seq [0 ].tolist ():
12311266 if tid < len (vocab ):
@@ -1247,6 +1282,10 @@ def autoregressive_generate(model, prompt: torch.Tensor, n_new: int,
12471282 cluster_len += 1
12481283 else :
12491284 cluster_len = 0
1285+ if tid < n_chars_local and tok not in (' ' , '\n ' ):
1286+ char_run += 1
1287+ else :
1288+ char_run = 0
12501289 for _ in range (n_new ):
12511290 T = seq .shape [1 ]
12521291 ctx = seq if T <= model .seq_len else seq [:, - model .seq_len :]
@@ -1293,6 +1332,10 @@ def autoregressive_generate(model, prompt: torch.Tensor, n_new: int,
12931332 probs [0 ] = substrate_word_spacing (
12941333 int (seq [0 , - 1 ]), probs [0 ], vocab ,
12951334 n_chars = n_chars_local )
1335+ # Anti-char-cascade (suppress char tokens after F(3)=2 chars).
1336+ if char_run >= _FIB_NUMS_FOR_BIGRAM [3 ]:
1337+ probs [0 ] = substrate_char_cascade (
1338+ char_run , probs [0 ], n_chars_local )
12961339 # Pronounceability filter (suppress impossible shapes).
12971340 if unpronounceable_mask is not None :
12981341 probs [0 ] = substrate_pronounceability (
@@ -1343,6 +1386,12 @@ def autoregressive_generate(model, prompt: torch.Tensor, n_new: int,
13431386 cluster_len += 1
13441387 else :
13451388 cluster_len = 0
1389+ # char_run: increment on plain char, reset on word/
1390+ # space/newline.
1391+ if nid < n_chars_local and tok not in (' ' , '\n ' ):
1392+ char_run += 1
1393+ else :
1394+ char_run = 0
13461395 model .train ()
13471396 return seq
13481397
@@ -1464,6 +1513,18 @@ def _single_stage_refine(model, draft, vocab_size, scorer, mode: str,
14641513 pos_probs = substrate_word_spacing (
14651514 int (new [0 , t_draft - 1 ]), pos_probs , vocab ,
14661515 n_chars = n_chars_r )
1516+ # Anti-char-cascade: compute char_run from prefix.
1517+ char_run_r = 0
1518+ for tid in new [0 , :t_draft ].tolist ():
1519+ if tid < len (vocab ):
1520+ tk_r = vocab [tid ]
1521+ if tid < n_chars_r and tk_r not in (' ' , '\n ' ):
1522+ char_run_r += 1
1523+ else :
1524+ char_run_r = 0
1525+ if char_run_r >= _FIB_NUMS_FOR_BIGRAM [3 ]:
1526+ pos_probs = substrate_char_cascade (
1527+ char_run_r , pos_probs , n_chars_r )
14671528 # Pronounceability filter.
14681529 if unpronounceable_mask is not None :
14691530 pos_probs = substrate_pronounceability (
0 commit comments