Skip to content

Commit 3f331c4

Browse files
committed
transformerless_lm: relax unpronounceable mask + fix refine sigs
v66 flagged 208/500 tokens including legit 'shall', 'which', 'think' (vowel ratio threshold 1/phi^2 ~ 0.382 too tight for short Shakespeare words with 1 vowel in 5 chars). Plus staged_refine and _single_stage_refine were missing the unpronounceable_mask kwarg. Fixes: - Drop vowel-ratio check; keep cluster (>F(5)=5) + triple + zero-vowel + length>F(3) all-consonant check. 1/500 flagged now ('iii'). - Thread unpronounceable_mask through both refine paths.
1 parent e607e33 commit 3f331c4

1 file changed

Lines changed: 13 additions & 14 deletions

File tree

experiments/transformerless_lm/train_self_recursive.py

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -668,24 +668,24 @@ def build_vowel_start_mask(vocab: list) -> torch.Tensor:
668668

669669

670670
def build_unpronounceable_mask(vocab: list) -> torch.Tensor:
671-
"""Mask = 1 for tokens with impossible-shape lettering:
672-
- max consonant cluster > F(4)=3
673-
- same letter triple (F(3)=2 repetitions of same char in a row)
674-
- vowel ratio < 1/phi^2 ~ 0.382 (too few vowels to be syllabic)
675-
Char tokens (len=1) are exempt. Pure substrate (char-class
676-
arithmetic + Fibonacci-tier thresholds).
671+
"""Mask = 1 for tokens with impossible-shape lettering.
672+
673+
Flags (any one disqualifies):
674+
- max consonant cluster > F(5)=5 (allows 'strengths', 'twelfth')
675+
- same letter triple (e.g., 'sss', 'fff', 'ttt')
676+
- zero vowels in length > F(3)=2 token (all-consonant word)
677+
678+
Char tokens (len=1) exempt. Non-alpha tokens exempt (contractions
679+
like "'tis"). Pure substrate (char-class + Fibonacci-tier).
677680
"""
678681
V = len(vocab)
679682
mask = torch.zeros(V)
680683
F = _FIB_NUMS_FOR_BIGRAM
681-
inv_phi2 = 1.0 / (_PHI_FOR_SAMPLING ** 2)
682684
for i, tok in enumerate(vocab):
683685
if not tok or len(tok) <= 1:
684686
continue
685-
# Skip if not all alphabetic (punctuation/contractions).
686687
if not all(c.isalpha() for c in tok):
687688
continue
688-
# Max consonant cluster.
689689
max_cluster = 0
690690
cur = 0
691691
for ch in tok:
@@ -695,16 +695,14 @@ def build_unpronounceable_mask(vocab: list) -> torch.Tensor:
695695
cur += 1
696696
if cur > max_cluster:
697697
max_cluster = cur
698-
# Same-letter triple.
699698
triple = False
700699
for j in range(len(tok) - 2):
701700
if tok[j] == tok[j + 1] == tok[j + 2]:
702701
triple = True
703702
break
704-
# Vowel ratio.
705703
n_vowel = sum(1 for c in tok if c in _IAMBIC_VOWELS)
706-
vowel_ratio = n_vowel / len(tok)
707-
if max_cluster > F[4] or triple or vowel_ratio < inv_phi2:
704+
all_consonant = (n_vowel == 0) and (len(tok) > F[3])
705+
if max_cluster > F[5] or triple or all_consonant:
708706
mask[i] = 1.0
709707
return mask
710708

@@ -1540,7 +1538,8 @@ def staged_refine(model, prompt, n_new, vocab_size,
15401538
vowel_start_mask: torch.Tensor = None,
15411539
end_vowels: list = None,
15421540
punct_mask: torch.Tensor = None,
1543-
newline_mask: torch.Tensor = None):
1541+
newline_mask: torch.Tensor = None,
1542+
unpronounceable_mask: torch.Tensor = None):
15441543
"""Staircase refinement: hit one score, then the next, then the next.
15451544
15461545
Stage 1: substrate alignment (minimize harmony) -- match the shape.

0 commit comments

Comments
 (0)