Skip to content

Commit f197533

Browse files
tokenizer: +110 dict entries (workflow idioms, partials, sigs)
Common multi-token shapes that show up across OMC test suites and LLM-driven sessions: variable-init patterns (`h sum = arr_sum_int(`, `h xs = [];`, `h result = null`), loop bodies (`arr_push(result, `), function-signature openers (`fn forward(`, `fn backward(`, `fn predict(`), test patterns (`test_record_failure(msg + "`, `concat_many(msg, `), math idioms (`sqrt(x * x + `, `exp(0 - `), substrate composition (`arr_fold_all(arr_resonance_vec(`), autograd init (`h W = tape_var(`, `tape_backward(L);`), Python bridge (`py_import(`, `py_call_method(`), and arr-slice / dict-get-or / type-of patterns. Builtin-heavy code compresses more aggressively; commonly-paired phrases collapse into single IDs.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent d3e25c3 commit f197533

1 file changed

Lines changed: 112 additions & 0 deletions

File tree

omnimcode-core/src/tokenizer.rs

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -820,6 +820,118 @@ pub const TOKEN_DICT: &[&str] = &[
820820
" return ",
821821
" h ",
822822
" h ",
823+
824+
// ---- Phrase dict expansion v3 (workflow idioms) ----
825+
"h sum = arr_sum_int(",
826+
"h len = arr_len(",
827+
"h size = dict_size(",
828+
"h key = arr_get(",
829+
"h value = dict_get(",
830+
"h first = arr_get(xs, 0)",
831+
"h last = arr_get(xs, arr_len(xs) - 1)",
832+
"h half = arr_len(xs) / 2",
833+
"h mid = (a + b) / 2",
834+
" h v = arr_get(",
835+
" h k = arr_get(",
836+
" h cur = arr_get(",
837+
" arr_push(result, ",
838+
" arr_push(acc, ",
839+
" sum = sum + ",
840+
" count = count + 1",
841+
" if cond {",
842+
" return ",
843+
"fn main()",
844+
"fn init(",
845+
"fn step(",
846+
"fn forward(",
847+
"fn backward(",
848+
"fn predict(",
849+
"fn train(",
850+
"fn evaluate(",
851+
"fn process(",
852+
"fn parse(",
853+
"fn format(",
854+
"fn serialize(",
855+
"fn deserialize(",
856+
"fn create(",
857+
"fn destroy(",
858+
"h err = \"\"",
859+
"h ok = 1",
860+
"h fail = 0",
861+
"h result = null",
862+
"json_parse(read_file(",
863+
"json_stringify(",
864+
"write_file(path, json_stringify(",
865+
"test_record_failure(msg)",
866+
"test_record_failure(msg + \"",
867+
"if expected != actual",
868+
"if !cond {",
869+
"if !approx_eq(",
870+
"test_record_failure(concat_many(",
871+
"pow(2, ",
872+
"sqrt(x * x + ",
873+
"log(arr_get(",
874+
"exp(0 - ",
875+
"abs(diff)",
876+
"max(a, b)",
877+
"min(a, b)",
878+
"is_attractor(arr_get(",
879+
"arr_fold_all(arr_resonance_vec(",
880+
"harmony(arr_sum_int(",
881+
"arr_resonance_vec(arr_fold_all(",
882+
"h W = tape_var(",
883+
"h b = tape_var(",
884+
"h X = tape_var([[",
885+
"h Z = tape_matmul(",
886+
"h Y = tape_relu(",
887+
"h L = tape_mean(",
888+
"tape_backward(L);",
889+
"tape_update(W, ",
890+
"h dW = tape_grad(",
891+
"py_import(",
892+
"py_call(",
893+
"py_call_method(",
894+
"py_callback(",
895+
"py_get(",
896+
"py_set(",
897+
"} else if ",
898+
"} elif ",
899+
"if found { break; }",
900+
"if found == 0 {",
901+
"return found;",
902+
"return result;",
903+
"return out;",
904+
"return acc;",
905+
"concat_many(\"",
906+
"concat_many(msg, ",
907+
"to_string(arr_len(",
908+
"to_string(dict_size(",
909+
"to_string(arr_get(",
910+
"if type_of(v) == \"",
911+
"if v == null",
912+
"if v != null",
913+
"arr_slice(xs, 0, ",
914+
"arr_slice(xs, i, ",
915+
"arr_take(xs, ",
916+
"arr_drop(xs, ",
917+
"dict_get_or(d, ",
918+
"# ----",
919+
"# ====",
920+
"# ---- ",
921+
"## ---",
922+
"assert_eq(arr_get(",
923+
"assert_true(arr_get(",
924+
"assert_true(dict_has(",
925+
"assert_eq(dict_size(",
926+
"assert_eq(arr_len(",
927+
"assert_eq(str_len(",
928+
"arr_resonance_vec([",
929+
"arr_him_vec([",
930+
"arr_fold_all([",
931+
"is_attractor(",
932+
"attractor_distance(",
933+
"fibonacci_index(",
934+
"crt_recover([",
823935
];
824936

825937
/// Substrate distance between two token IDs. Returns the absolute

0 commit comments

Comments
 (0)