Skip to content

Commit 5a1fe73

Browse files
committed
transformerless_lm: anti-char-cascade primitive
substrate_char_cascade: tracks a char_run counter (incremented on plain-char emission, reset on word/space/newline). Once char_run >= F(3)=2, it suppresses ALL char tokens by a factor of 1/phi^(pi*F(tier)). This eliminates sampling-time concat artifacts ('thouA', 'drinesa', 'mensFDoroyali'). Word_spacing helps; anti-cascade is the hard stop. Also widened the unpronounceable threshold to 1/phi^3 ~ 0.236 to spare legit Shakespeare names ('northumberland', 'buckingham'). The vocab-level mask flags only 'iii' now.
1 parent 3f331c4 commit 5a1fe73

1 file changed

Lines changed: 62 additions & 1 deletion

File tree

experiments/transformerless_lm/train_self_recursive.py

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -702,11 +702,45 @@ def build_unpronounceable_mask(vocab: list) -> torch.Tensor:
702702
break
703703
n_vowel = sum(1 for c in tok if c in _IAMBIC_VOWELS)
704704
all_consonant = (n_vowel == 0) and (len(tok) > F[3])
705-
if max_cluster > F[5] or triple or all_consonant:
705+
# Long words with very low vowel ratio: 6+ chars, < 1/phi^3 ~ 0.236.
706+
# Eases past legit proper nouns ('northumberland' 0.29,
707+
# 'buckingham' 0.30) but flags consonant-soup tokens.
708+
low_vowel_long = (
709+
len(tok) > F[5]
710+
and (n_vowel / len(tok)) < (1.0 / (_PHI_FOR_SAMPLING ** 3))
711+
)
712+
if max_cluster > F[5] or triple or all_consonant or low_vowel_long:
706713
mask[i] = 1.0
707714
return mask
708715

709716

717+
def substrate_char_cascade(char_run: int, probs: torch.Tensor,
                           n_chars: int,
                           exempt_ids: tuple = ()) -> torch.Tensor:
    """Anti-char-cascade: damp char-token probabilities during a char run.

    Once ``char_run`` reaches the threshold ``_FIB_NUMS_FOR_BIGRAM[3]``
    (F(3); described as 2 by the original author), every token id in
    ``[0, n_chars)`` is scaled by
    ``1 / phi ** (pi * F(tier))`` and the distribution is renormalised.
    ``tier`` is how far the run is past the threshold (starting at 1),
    clamped to the last index of ``_FIB_NUMS_FOR_BIGRAM``.

    Intended to prevent sampling-time artifacts like 'thouA', 'drinesa',
    'mensFDoroyali', where raw chars get strung onto a word without
    spacing.

    NOTE: this function has no vocabulary access, so by default it
    penalises the WHOLE ``[:n_chars]`` slice -- including space and
    newline if their ids fall inside it. The callers only exempt those
    two tokens from the *run counter*. Pass their ids via ``exempt_ids``
    to also keep them unpenalised here, so that they remain available
    as a way to end the run.

    Args:
        char_run: Number of consecutive plain-char tokens emitted so far.
        probs: Probability vector indexed by token id along dim 0
            (presumably 1-D over the vocabulary -- confirm at call site).
        n_chars: Count of leading token ids treated as char tokens.
        exempt_ids: Token ids inside ``[0, n_chars)`` that must keep
            their pre-penalty probability. Ids outside that range are
            ignored. Defaults to empty (original behaviour).

    Returns:
        ``probs`` itself, untouched, when the run is below threshold,
        ``n_chars <= 0``, or ``n_chars >= probs.shape[0]``; otherwise a
        new renormalised tensor.
    """
    threshold = _FIB_NUMS_FOR_BIGRAM[3]
    if char_run < threshold or n_chars <= 0:
        return probs
    # When the char range covers every index there is no non-char token
    # left to receive the mass, so the input is returned as-is.
    if n_chars >= probs.shape[0]:
        return probs

    # excess is 1 on the first suppressed step and grows with the run.
    excess = char_run - threshold + 1
    tier = min(excess, len(_FIB_NUMS_FOR_BIGRAM) - 1)
    penalty = 1.0 / (
        _PHI_FOR_SAMPLING ** (math.pi * _FIB_NUMS_FOR_BIGRAM[tier])
    )

    out = probs.clone()
    out[:n_chars] = out[:n_chars] * penalty
    # Restore the original mass of explicitly exempted char ids.
    for tid in exempt_ids:
        if 0 <= tid < n_chars:
            out[tid] = probs[tid]
    # Epsilon guards against division by zero on an all-zero vector.
    return out / (out.sum() + 1e-8)
742+
743+
710744
def substrate_word_spacing(prev_tid: int, probs: torch.Tensor,
711745
vocab: list, n_chars: int = 65) -> torch.Tensor:
712746
"""After a word-token (rank >= n_chars), boost the space token to
@@ -1226,6 +1260,7 @@ def autoregressive_generate(model, prompt: torch.Tensor, n_new: int,
12261260
syl_pos = 0
12271261
open_needs = 0
12281262
cluster_len = 0
1263+
char_run = 0
12291264
if vocab is not None:
12301265
for tid in seq[0].tolist():
12311266
if tid < len(vocab):
@@ -1247,6 +1282,10 @@ def autoregressive_generate(model, prompt: torch.Tensor, n_new: int,
12471282
cluster_len += 1
12481283
else:
12491284
cluster_len = 0
1285+
if tid < n_chars_local and tok not in (' ', '\n'):
1286+
char_run += 1
1287+
else:
1288+
char_run = 0
12501289
for _ in range(n_new):
12511290
T = seq.shape[1]
12521291
ctx = seq if T <= model.seq_len else seq[:, -model.seq_len:]
@@ -1293,6 +1332,10 @@ def autoregressive_generate(model, prompt: torch.Tensor, n_new: int,
12931332
probs[0] = substrate_word_spacing(
12941333
int(seq[0, -1]), probs[0], vocab,
12951334
n_chars=n_chars_local)
1335+
# Anti-char-cascade (suppress char tokens after F(3)=2 chars).
1336+
if char_run >= _FIB_NUMS_FOR_BIGRAM[3]:
1337+
probs[0] = substrate_char_cascade(
1338+
char_run, probs[0], n_chars_local)
12961339
# Pronounceability filter (suppress impossible shapes).
12971340
if unpronounceable_mask is not None:
12981341
probs[0] = substrate_pronounceability(
@@ -1343,6 +1386,12 @@ def autoregressive_generate(model, prompt: torch.Tensor, n_new: int,
13431386
cluster_len += 1
13441387
else:
13451388
cluster_len = 0
1389+
# char_run: increment on plain char, reset on word/
1390+
# space/newline.
1391+
if nid < n_chars_local and tok not in (' ', '\n'):
1392+
char_run += 1
1393+
else:
1394+
char_run = 0
13461395
model.train()
13471396
return seq
13481397

@@ -1464,6 +1513,18 @@ def _single_stage_refine(model, draft, vocab_size, scorer, mode: str,
14641513
pos_probs = substrate_word_spacing(
14651514
int(new[0, t_draft - 1]), pos_probs, vocab,
14661515
n_chars=n_chars_r)
1516+
# Anti-char-cascade: compute char_run from prefix.
1517+
char_run_r = 0
1518+
for tid in new[0, :t_draft].tolist():
1519+
if tid < len(vocab):
1520+
tk_r = vocab[tid]
1521+
if tid < n_chars_r and tk_r not in (' ', '\n'):
1522+
char_run_r += 1
1523+
else:
1524+
char_run_r = 0
1525+
if char_run_r >= _FIB_NUMS_FOR_BIGRAM[3]:
1526+
pos_probs = substrate_char_cascade(
1527+
char_run_r, pos_probs, n_chars_r)
14671528
# Pronounceability filter.
14681529
if unpronounceable_mask is not None:
14691530
pos_probs = substrate_pronounceability(

0 commit comments

Comments
 (0)