@@ -499,6 +499,143 @@ def substrate_vocab_curriculum(probs: torch.Tensor,
499499_IAMBIC_VOWELS = set ("aeiouAEIOU" )
500500
501501
502+ def _token_morphology (tok : str ) -> str :
503+ """Universal morphology class from suffix (no English word lists).
504+ char | verb_archaic | gerund | past | adverb | plural | root.
505+ """
506+ if not tok or len (tok ) <= 1 :
507+ return 'char'
508+ if tok .endswith ('eth' ) or tok .endswith ('est' ):
509+ return 'verb_archaic'
510+ if tok .endswith ('ing' ):
511+ return 'gerund'
512+ if tok .endswith ('ed' ):
513+ return 'past'
514+ if tok .endswith ('ly' ):
515+ return 'adverb'
516+ if tok .endswith ('s' ) and len (tok ) > 2 :
517+ return 'plural'
518+ return 'root'
519+
520+
def build_symbol_classes(vocab: list, n_chars: int = 65) -> tuple:
    """Assign every token a symbol class = (rank tier, morphology).

    The rank tier is the band the token's index falls into, where band
    widths are the successive entries of ``_FIB_NUMS_FOR_BIGRAM`` laid
    out after the first ``n_chars`` indices. Indices below ``n_chars``
    get tier -1; indices past the last band get one extra overflow
    tier. Morphology comes from ``_token_morphology``.

    Args:
        vocab: tokens in rank order (index = rank).
        n_chars: number of leading indices treated as the char region.

    Returns:
        ``(class_id_tensor, n_classes)`` where ``class_id_tensor`` is a
        long tensor of length ``len(vocab)`` and ``n_classes`` is the
        largest class id plus one. For an empty ``vocab`` the result is
        an empty long tensor and ``0`` (previously this raised, because
        ``max()`` of an empty tensor is undefined).
    """
    if not vocab:
        return torch.empty(0, dtype=torch.long), 0

    # Exclusive upper index of each tier, accumulated from n_chars.
    # Original note: F is [1,1,2,3,5,8,13,21,34,55,89,144].
    tier_bounds = []
    running = n_chars
    for width in _FIB_NUMS_FOR_BIGRAM:
        running += width
        tier_bounds.append(running)

    def rank_tier(i: int) -> int:
        if i < n_chars:
            return -1
        for tier, bound in enumerate(tier_bounds):
            if i < bound:
                return tier
        return len(tier_bounds)

    morphs = ['char', 'verb_archaic', 'gerund', 'past',
              'adverb', 'plural', 'root']
    morph_to_idx = {m: idx for idx, m in enumerate(morphs)}
    n_morph = len(morphs)

    # Class id = (tier + 1) * n_morph + morph_idx; the +1 shifts the
    # char tier (-1) to 0 so ids are non-negative.
    class_ids = [
        (rank_tier(i) + 1) * n_morph + morph_to_idx[_token_morphology(tok)]
        for i, tok in enumerate(vocab)
    ]
    class_id_tensor = torch.tensor(class_ids, dtype=torch.long)
    n_classes = int(class_id_tensor.max().item()) + 1
    return class_id_tensor, n_classes
558+
559+
def substrate_symbolic_substitution(probs: torch.Tensor,
                                    class_id_tensor: torch.Tensor,
                                    n_classes: int,
                                    alpha: float = None) -> torch.Tensor:
    """Blend each token's probability with its class average.

    For every symbol class the total mass is split evenly among the
    class members; each token then keeps ``(1 - alpha)`` of its own
    probability and receives ``alpha`` of that even share. The result
    is renormalised by its sum (plus 1e-8).

    ``probs`` and ``class_id_tensor`` are indexed together along dim 0.
    When ``alpha`` is None it defaults to ``1 / _PHI_FOR_SAMPLING**pi``
    (documented in the original as roughly 0.221).
    """
    if alpha is None:
        alpha = 1.0 / (_PHI_FOR_SAMPLING ** math.pi)

    class_of = class_id_tensor.to(probs.device)

    # Per-class probability mass and per-class member count.
    mass = probs.new_zeros(n_classes).scatter_add_(0, class_of, probs)
    members = probs.new_zeros(n_classes).scatter_add_(
        0, class_of, torch.ones_like(probs))
    # Classes with no members would divide by zero; treat them as size 1.
    members.clamp_(min=1.0)

    # Even share of the class mass, gathered back to token positions.
    sibling_mean = (mass / members)[class_of]

    blended = (1.0 - alpha) * probs + alpha * sibling_mean
    return blended / (blended.sum() + 1e-8)
587+
588+
def build_pronoun_mask(vocab: list, n_chars: int = 65) -> torch.Tensor:
    """Mark pronoun-shaped tokens: low rank, one syllable, no suffix.

    A token at index ``i`` is marked when all of the following hold:
    it is longer than one character, ``i < n_chars + 13`` (the char
    region plus the 13 top-ranked words), ``_token_morphology`` returns
    'root', and ``_approx_syllables`` returns 1.

    Args:
        vocab: tokens in rank order (index = rank).
        n_chars: size of the leading char region. The default of 65
            reproduces the previously hard-coded cutoff of 78.

    Returns:
        Float tensor of length ``len(vocab)`` with 1.0 at marked
        positions and 0.0 elsewhere.
    """
    low_rank_limit = n_chars + 13
    mask = torch.zeros(len(vocab))
    for i, tok in enumerate(vocab):
        # Nothing at or beyond the cutoff can qualify, so stop scanning.
        if i >= low_rank_limit:
            break
        if not tok or len(tok) == 1:
            continue
        if _token_morphology(tok) == 'root' and _approx_syllables(tok) == 1:
            mask[i] = 1.0
    return mask
604+
605+
def substrate_reference_chain(recent_tokens: list,
                              pronoun_mask: torch.Tensor,
                              probs: torch.Tensor,
                              n_chars: int = 65) -> torch.Tensor:
    """Boost pronoun-shaped tokens after recent content tokens.

    Walks ``recent_tokens`` from most recent backwards (at most 13
    steps). Each content token at distance ``k`` (0 = most recent) adds
    ``F[k] / phi**(pi*k)`` to a pressure total, with ``k`` clamped to
    the last index of ``_FIB_NUMS_FOR_BIGRAM``. The multiplicative
    boost applied where ``pronoun_mask`` is 1 is
    ``phi ** (pressure / (1 + pressure))``, so it stays below ``phi``.
    The result is renormalised by its sum (plus 1e-8).

    A token id counts as content when ``tid >= n_chars + 13``, i.e.
    everything past the char region and the 13 top-ranked words. This
    is the exact complement of the ``i < 78`` low-rank range used when
    the pronoun mask is built with the default ``n_chars``.

    Args:
        recent_tokens: token ids, oldest first.
        pronoun_mask: per-token 0/1 mask aligned with ``probs``.
        probs: probability vector to adjust.
        n_chars: size of the leading char region of the vocabulary.

    Returns:
        ``probs`` itself when there is nothing to boost, otherwise a
        new renormalised tensor.
    """
    if not recent_tokens:
        return probs
    fib = _FIB_NUMS_FOR_BIGRAM
    phi = _PHI_FOR_SAMPLING
    phi_pi = phi ** math.pi
    # The threshold was previously n_chars + fib[7], annotated as
    # "65 + 13 = 78"; with the list documented elsewhere in this file
    # ([1,1,2,3,5,8,13,21,...]) index 7 is 21, giving 86. Use the
    # intended 13 directly so it cannot drift from the pronoun mask.
    content_start = n_chars + 13
    window = 13
    pressure = 0.0
    for dist, tid in enumerate(reversed(recent_tokens)):
        if dist >= window:
            break
        # '>=' (not '>') so the first content id is not left in limbo
        # between the pronoun-eligible range and the content range.
        if tid >= content_start:
            k = min(dist, len(fib) - 1)
            pressure += fib[k] / (phi_pi ** k)
    if pressure <= 0:
        return probs
    log_boost = math.log(phi) * pressure / (1.0 + pressure)
    boost_factor = math.exp(log_boost)
    pmask = pronoun_mask.to(probs.device).to(probs.dtype)
    # 1.0 where the mask is 0, boost_factor where it is 1.
    boost = 1.0 + pmask * (boost_factor - 1.0)
    out = probs * boost
    return out / (out.sum() + 1e-8)
637+
638+
502639def _approx_syllables (tok_str : str ) -> int :
503640 """Approximate syllable count = number of vowel-clusters.
504641 Pure substrate (char-class arithmetic). Min 1 for non-empty tokens.
@@ -807,7 +944,10 @@ def autoregressive_generate(model, prompt: torch.Tensor, n_new: int,
807944 bigram_prior : torch .Tensor = None ,
808945 vocab : list = None ,
809946 token_signatures : torch .Tensor = None ,
810- active_vocab_size : int = None ):
947+ active_vocab_size : int = None ,
948+ class_id_tensor : torch .Tensor = None ,
949+ n_classes : int = 0 ,
950+ pronoun_mask : torch .Tensor = None ):
811951 """Sample n_new tokens autoregressively with substrate sampling AND
812952 a substrate-canonical recency penalty.
813953
@@ -847,6 +987,15 @@ def autoregressive_generate(model, prompt: torch.Tensor, n_new: int,
847987 # Iambic stress rhythm (period-2 weak/STRONG alternation).
848988 probs [0 ] = substrate_iambic_phase (
849989 syl_pos , probs [0 ], vocab_size )
990+ # Symbolic substitution (within-class mass smoothing).
991+ if class_id_tensor is not None and n_classes > 0 :
992+ probs [0 ] = substrate_symbolic_substitution (
993+ probs [0 ], class_id_tensor , n_classes )
994+ # Symbolic reference chain (pronoun anaphora).
995+ if pronoun_mask is not None and seq .shape [1 ] >= 1 :
996+ recent_list = seq [0 , - 13 :].tolist ()
997+ probs [0 ] = substrate_reference_chain (
998+ recent_list , pronoun_mask , probs [0 ])
850999 # Theme momentum (subject-matter coherence).
8511000 if token_signatures is not None and seq .shape [1 ] >= 1 :
8521001 recent_list = seq [0 , - 13 :].tolist ()
@@ -888,7 +1037,10 @@ def _single_stage_refine(model, draft, vocab_size, scorer, mode: str,
8881037 bigram_prior : torch .Tensor = None ,
8891038 vocab : list = None ,
8901039 token_signatures : torch .Tensor = None ,
891- active_vocab_size : int = None ):
1040+ active_vocab_size : int = None ,
1041+ class_id_tensor : torch .Tensor = None ,
1042+ n_classes : int = 0 ,
1043+ pronoun_mask : torch .Tensor = None ):
8921044 """One refinement stage: optimize a single score until plateau.
8931045
8941046 mode: 'min' (harmony, quality) or 'max' (creativity).
@@ -947,6 +1099,16 @@ def _single_stage_refine(model, draft, vocab_size, scorer, mode: str,
9471099 syl_pos += _approx_syllables (vocab [tid ])
9481100 pos_probs = substrate_iambic_phase (
9491101 syl_pos , pos_probs , vocab_size_local )
1102+ # Symbolic substitution (within-class smoothing).
1103+ if class_id_tensor is not None and n_classes > 0 :
1104+ pos_probs = substrate_symbolic_substitution (
1105+ pos_probs , class_id_tensor , n_classes )
1106+ # Symbolic reference chain (pronoun anaphora).
1107+ if pronoun_mask is not None and t_draft >= 1 :
1108+ recent_start = max (0 , t_draft - 13 )
1109+ recent_list = new [0 , recent_start :t_draft ].tolist ()
1110+ pos_probs = substrate_reference_chain (
1111+ recent_list , pronoun_mask , pos_probs )
9501112 # Theme momentum (subject-matter coherence).
9511113 if token_signatures is not None and t_draft >= 1 :
9521114 recent_start = max (0 , t_draft - 13 )
@@ -1004,7 +1166,10 @@ def staged_refine(model, prompt, n_new, vocab_size,
10041166 bigram_prior : torch .Tensor = None ,
10051167 vocab : list = None ,
10061168 token_signatures : torch .Tensor = None ,
1007- active_vocab_size : int = None ):
1169+ active_vocab_size : int = None ,
1170+ class_id_tensor : torch .Tensor = None ,
1171+ n_classes : int = 0 ,
1172+ pronoun_mask : torch .Tensor = None ):
10081173 """Staircase refinement: hit one score, then the next, then the next.
10091174
10101175 Stage 1: substrate alignment (minimize harmony) -- match the shape.
@@ -1020,7 +1185,7 @@ def staged_refine(model, prompt, n_new, vocab_size,
10201185 with torch .no_grad ():
10211186 draft = autoregressive_generate (model , prompt , n_new = n_new ,
10221187 vocab_size = vocab_size ,
1023- temperature = temperature , bigram_prior = bigram_prior , vocab = vocab , token_signatures = token_signatures , active_vocab_size = active_vocab_size )
1188+ temperature = temperature , bigram_prior = bigram_prior , vocab = vocab , token_signatures = token_signatures , active_vocab_size = active_vocab_size , class_id_tensor = class_id_tensor , n_classes = n_classes , pronoun_mask = pronoun_mask )
10241189 stages_out = {}
10251190 stages_out ["initial" ] = {"seq" : draft .clone (),
10261191 "harmony" : harmony_scorer (draft ),
@@ -1033,7 +1198,7 @@ def staged_refine(model, prompt, n_new, vocab_size,
10331198 n_iters = n_iters_per_stage ,
10341199 resample_frac = resample_frac ,
10351200 prompt_len = prompt_len ,
1036- temperature = temperature , bigram_prior = bigram_prior , vocab = vocab , token_signatures = token_signatures , active_vocab_size = active_vocab_size )
1201+ temperature = temperature , bigram_prior = bigram_prior , vocab = vocab , token_signatures = token_signatures , active_vocab_size = active_vocab_size , class_id_tensor = class_id_tensor , n_classes = n_classes , pronoun_mask = pronoun_mask )
10371202 stages_out ["after_harmony" ] = {"seq" : draft .clone (),
10381203 "trajectory" : h_traj ,
10391204 "harmony" : harmony_scorer (draft ),
@@ -1046,7 +1211,7 @@ def staged_refine(model, prompt, n_new, vocab_size,
10461211 n_iters = n_iters_per_stage ,
10471212 resample_frac = resample_frac ,
10481213 prompt_len = prompt_len ,
1049- temperature = temperature , bigram_prior = bigram_prior , vocab = vocab , token_signatures = token_signatures , active_vocab_size = active_vocab_size )
1214+ temperature = temperature , bigram_prior = bigram_prior , vocab = vocab , token_signatures = token_signatures , active_vocab_size = active_vocab_size , class_id_tensor = class_id_tensor , n_classes = n_classes , pronoun_mask = pronoun_mask )
10501215 stages_out ["after_quality" ] = {"seq" : draft .clone (),
10511216 "trajectory" : q_traj ,
10521217 "harmony" : harmony_scorer (draft ),
@@ -1060,7 +1225,7 @@ def staged_refine(model, prompt, n_new, vocab_size,
10601225 n_iters = n_iters_per_stage ,
10611226 resample_frac = resample_frac ,
10621227 prompt_len = prompt_len ,
1063- temperature = temperature , bigram_prior = bigram_prior , vocab = vocab , token_signatures = token_signatures , active_vocab_size = active_vocab_size )
1228+ temperature = temperature , bigram_prior = bigram_prior , vocab = vocab , token_signatures = token_signatures , active_vocab_size = active_vocab_size , class_id_tensor = class_id_tensor , n_classes = n_classes , pronoun_mask = pronoun_mask )
10641229 stages_out ["after_creativity" ] = {"seq" : draft .clone (),
10651230 "trajectory" : c_traj ,
10661231 "harmony" : harmony_scorer (draft ),
@@ -1094,7 +1259,7 @@ def iterative_refine(model, prompt, n_new, vocab_size,
10941259 # Step 1: initial draft.
10951260 draft = autoregressive_generate (model , prompt , n_new = n_new ,
10961261 vocab_size = vocab_size ,
1097- temperature = temperature , bigram_prior = bigram_prior , vocab = vocab , token_signatures = token_signatures , active_vocab_size = active_vocab_size )
1262+ temperature = temperature , bigram_prior = bigram_prior , vocab = vocab , token_signatures = token_signatures , active_vocab_size = active_vocab_size , class_id_tensor = class_id_tensor , n_classes = n_classes , pronoun_mask = pronoun_mask )
10981263 history = []
10991264 h0 = harmony_scorer (draft ) if harmony_scorer is not None else None
11001265 q0 = quality_scorer (draft ) if quality_scorer is not None else None
@@ -1489,12 +1654,22 @@ def quality_fn(seq_tokens):
14891654 bigram_prior = build_substrate_bigram (vocab_size )
14901655 print (f" refined substrate bigram (shape * POS): { bigram_prior .shape } " )
14911656
1492- # Substrate token signatures (theme momentum) -- F-frequency cos basis
1493- # over char codes, phi-decayed across positions. L2-normalized.
1494- # NOTE: v57 showed theme momentum drags mean creativity ~-0.01.
1495- # Disabled for v59 to isolate iambic + threading.
1657+ # Theme momentum disabled (v57 showed it drags ~-0.01).
14961658 token_signatures = None
14971659
1660+ # Symbolic primitives (v60+): equivalence classes + reference chain.
1661+ if vocab_for_bigram is not None :
1662+ n_chars_local = sum (1 for t in vocab_for_bigram if len (t ) == 1 )
1663+ class_id_tensor , n_classes = build_symbol_classes (
1664+ vocab_for_bigram , n_chars = n_chars_local )
1665+ pronoun_mask = build_pronoun_mask (vocab_for_bigram )
1666+ print (f" symbol classes: { n_classes } | "
1667+ f"pronoun candidates: { int (pronoun_mask .sum ().item ())} " )
1668+ else :
1669+ class_id_tensor = None
1670+ n_classes = 0
1671+ pronoun_mask = None
1672+
14981673 # Active training base: starts as tiny_seed, GROWS by appending each
14991674 # cycle's best refined output -- only if (a) creativity > corpus
15001675 # baseline AND (b) anchor weight constraint still satisfied.
@@ -1561,14 +1736,14 @@ def quality_fn(seq_tokens):
15611736 draft = autoregressive_generate (
15621737 model , prompt_s , n_new = growth_n_new ,
15631738 vocab_size = vocab_size , temperature = 0.8 ,
1564- bigram_prior = bigram_prior , vocab = vocab , token_signatures = token_signatures , active_vocab_size = active_vocab_size )
1739+ bigram_prior = bigram_prior , vocab = vocab , token_signatures = token_signatures , active_vocab_size = active_vocab_size , class_id_tensor = class_id_tensor , n_classes = n_classes , pronoun_mask = pronoun_mask )
15651740 refined_s , _ = staged_refine (
15661741 model , prompt_s , n_new = growth_n_new , vocab_size = vocab_size ,
15671742 harmony_scorer = harmony_fn , quality_scorer = quality_fn ,
15681743 creativity_scorer = creativity_fn ,
15691744 n_iters_per_stage = 30 , resample_frac = 0.35 ,
15701745 prompt_len = 16 , temperature = 0.5 ,
1571- bigram_prior = bigram_prior , vocab = vocab , token_signatures = token_signatures , active_vocab_size = active_vocab_size )
1746+ bigram_prior = bigram_prior , vocab = vocab , token_signatures = token_signatures , active_vocab_size = active_vocab_size , class_id_tensor = class_id_tensor , n_classes = n_classes , pronoun_mask = pronoun_mask )
15721747 samples .append ((refined_s .squeeze (0 ).clone (),
15731748 creativity_fn (refined_s )))
15741749 # Sort by creativity desc, keep top K.
@@ -1638,14 +1813,14 @@ def quality_fn(seq_tokens):
16381813 final_gen = autoregressive_generate (model , prompt , n_new = n_new ,
16391814 vocab_size = vocab_size ,
16401815 temperature = 0.8 ,
1641- bigram_prior = bigram_prior , vocab = vocab , token_signatures = token_signatures , active_vocab_size = active_vocab_size )
1816+ bigram_prior = bigram_prior , vocab = vocab , token_signatures = token_signatures , active_vocab_size = active_vocab_size , class_id_tensor = class_id_tensor , n_classes = n_classes , pronoun_mask = pronoun_mask )
16421817 final_refined , _ = staged_refine (
16431818 model , prompt , n_new = n_new , vocab_size = vocab_size ,
16441819 harmony_scorer = harmony_fn , quality_scorer = quality_fn ,
16451820 creativity_scorer = creativity_fn ,
16461821 n_iters_per_stage = 200 , resample_frac = 0.35 ,
16471822 prompt_len = 16 , temperature = 0.5 ,
1648- bigram_prior = bigram_prior , vocab = vocab , token_signatures = token_signatures , active_vocab_size = active_vocab_size )
1823+ bigram_prior = bigram_prior , vocab = vocab , token_signatures = token_signatures , active_vocab_size = active_vocab_size , class_id_tensor = class_id_tensor , n_classes = n_classes , pronoun_mask = pronoun_mask )
16491824
16501825 return {"name" : name , "mode" : "self_distillation" ,
16511826 "n_params" : n_params ,
0 commit comments