tokenizer: +110 dict entries (workflow idioms, partials, sigs)

RandomCoder-lab · claude · RandomCoder-lab · commit f197533ce540 · 2026-05-16T12:06:57.000-05:00
Common multi-token shapes that show up across OMC test suites and
LLM-driven sessions: variable-init patterns (`h sum = arr_sum_int(`,
`h xs = [];`, `h result = null`), loop bodies (`    arr_push(result, `),
function-signature openers (`fn forward(`, `fn backward(`, `fn predict(`),
test patterns (`test_record_failure(msg + "`, `concat_many(msg, `),
math idioms (`sqrt(x * x + `, `exp(0 - `), substrate composition
(`arr_fold_all(arr_resonance_vec(`), autograd init (`h W = tape_var(`,
`tape_backward(L);`), Python bridge (`py_import(`, `py_call_method(`),
and arr-slice / dict-get-or / type-of patterns.

Builtin-heavy code compresses more aggressively; commonly-paired
phrases collapse into single IDs.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
diff --git a/omnimcode-core/src/tokenizer.rs b/omnimcode-core/src/tokenizer.rs
@@ -820,6 +820,118 @@ pub const TOKEN_DICT: &[&str] = &[
     "        return ",
     "    h ",
     "        h ",
+
+    // ---- Phrase dict expansion v3 (workflow idioms) ----
+    "h sum = arr_sum_int(",
+    "h len = arr_len(",
+    "h size = dict_size(",
+    "h key = arr_get(",
+    "h value = dict_get(",
+    "h first = arr_get(xs, 0)",
+    "h last = arr_get(xs, arr_len(xs) - 1)",
+    "h half = arr_len(xs) / 2",
+    "h mid = (a + b) / 2",
+    "    h v = arr_get(",
+    "    h k = arr_get(",
+    "    h cur = arr_get(",
+    "    arr_push(result, ",
+    "    arr_push(acc, ",
+    "    sum = sum + ",
+    "    count = count + 1",
+    "    if cond {",
+    "        return ",
+    "fn main()",
+    "fn init(",
+    "fn step(",
+    "fn forward(",
+    "fn backward(",
+    "fn predict(",
+    "fn train(",
+    "fn evaluate(",
+    "fn process(",
+    "fn parse(",
+    "fn format(",
+    "fn serialize(",
+    "fn deserialize(",
+    "fn create(",
+    "fn destroy(",
+    "h err = \"\"",
+    "h ok = 1",
+    "h fail = 0",
+    "h result = null",
+    "json_parse(read_file(",
+    "json_stringify(",
+    "write_file(path, json_stringify(",
+    "test_record_failure(msg)",
+    "test_record_failure(msg + \"",
+    "if expected != actual",
+    "if !cond {",
+    "if !approx_eq(",
+    "test_record_failure(concat_many(",
+    "pow(2, ",
+    "sqrt(x * x + ",
+    "log(arr_get(",
+    "exp(0 - ",
+    "abs(diff)",
+    "max(a, b)",
+    "min(a, b)",
+    "is_attractor(arr_get(",
+    "arr_fold_all(arr_resonance_vec(",
+    "harmony(arr_sum_int(",
+    "arr_resonance_vec(arr_fold_all(",
+    "h W = tape_var(",
+    "h b = tape_var(",
+    "h X = tape_var([[",
+    "h Z = tape_matmul(",
+    "h Y = tape_relu(",
+    "h L = tape_mean(",
+    "tape_backward(L);",
+    "tape_update(W, ",
+    "h dW = tape_grad(",
+    "py_import(",
+    "py_call(",
+    "py_call_method(",
+    "py_callback(",
+    "py_get(",
+    "py_set(",
+    "} else if ",
+    "} elif ",
+    "if found { break; }",
+    "if found == 0 {",
+    "return found;",
+    "return result;",
+    "return out;",
+    "return acc;",
+    "concat_many(\"",
+    "concat_many(msg, ",
+    "to_string(arr_len(",
+    "to_string(dict_size(",
+    "to_string(arr_get(",
+    "if type_of(v) == \"",
+    "if v == null",
+    "if v != null",
+    "arr_slice(xs, 0, ",
+    "arr_slice(xs, i, ",
+    "arr_take(xs, ",
+    "arr_drop(xs, ",
+    "dict_get_or(d, ",
+    "# ----",
+    "# ====",
+    "# ---- ",
+    "## ---",
+    "assert_eq(arr_get(",
+    "assert_true(arr_get(",
+    "assert_true(dict_has(",
+    "assert_eq(dict_size(",
+    "assert_eq(arr_len(",
+    "assert_eq(str_len(",
+    "arr_resonance_vec([",
+    "arr_him_vec([",
+    "arr_fold_all([",
+    "is_attractor(",
+    "attractor_distance(",
+    "fibonacci_index(",
+    "crt_recover([",
 ];
 
 /// Substrate distance between two token IDs. Returns the absolute