Skip to content

Commit 4963cd3

Browse files
tokenizer: expand TOKEN_DICT by 285 builtins + 53 common idioms
Bulk-add every interpreter builtin not yet in TOKEN_DICT, plus ~50 high-frequency OMC code idioms (`h x = `, `i = i + 1;`, `arr_push(out, `, `assert_eq(`, `} else {`, etc.).

Greedy longest-match is order-stable, so existing encoded streams keep round-tripping. New entries get high IDs (they don't displace the attractor-aligned 1..19 slots).

Net effect: any OMC code using ANY of the 368 builtins now compresses; idiom-heavy code drops to ~1/3 its raw size.

Measured compression on the demo workloads:

    arr_softmax([1.0...])      1.75 -> 1.75 (already in dict)
    h x = arr_matmul(A, B);    1.77 -> 2.30
    tape_reset();...           2.41 -> 2.79
    if i < arr_len(xs)...      1.80 -> 2.14

Roundtrip exact on all 15 tokenizer tests after the change.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent 75f597a commit 4963cd3

1 file changed

Lines changed: 340 additions & 0 deletions

File tree

omnimcode-core/src/tokenizer.rs

Lines changed: 340 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,346 @@ pub const TOKEN_DICT: &[&str] = &[
218218
// Common 2-char operators / openers
219219
"==", "!=", "<=", ">=", "&&", "||", "<<", ">>",
220220
"//", "/*", "*/",
221+
222+
// ---- Auto-appended bulk dict expansion (Phase 2) ----
223+
"abs",
224+
"acos",
225+
"arr_all",
226+
"arr_any",
227+
"arr_argmax",
228+
"arr_argmin",
229+
"arr_avg_distance",
230+
"arr_chunk",
231+
"arr_concat",
232+
"arr_contains",
233+
"arr_count",
234+
"arr_cumsum",
235+
"arr_diff",
236+
"arr_drop",
237+
"arr_enumerate",
238+
"arr_filter",
239+
"arr_find",
240+
"arr_first",
241+
"arr_flatten",
242+
"arr_fold_elements",
243+
"arr_from_range",
244+
"arr_gcd",
245+
"arr_geometric_mean",
246+
"arr_harmonic_mean",
247+
"arr_index_of",
248+
"arr_is_sorted",
249+
"arr_join",
250+
"arr_last",
251+
"arr_map",
252+
"arr_max",
253+
"arr_max_float",
254+
"arr_max_int",
255+
"arr_mean",
256+
"arr_median",
257+
"arr_min",
258+
"arr_min_float",
259+
"arr_min_int",
260+
"arr_neg",
261+
"arr_new",
262+
"arr_norm",
263+
"arr_ones",
264+
"arr_outer",
265+
"arr_partition_by",
266+
"arr_product",
267+
"arr_range",
268+
"arr_reduce",
269+
"arr_repeat",
270+
"arr_resonance",
271+
"arr_reverse",
272+
"arr_slice",
273+
"arr_sort",
274+
"arr_sort_int",
275+
"arr_stddev",
276+
"arr_sum",
277+
"arr_sum_int",
278+
"arr_sum_sq",
279+
"arr_take",
280+
"arr_unique",
281+
"arr_unique_count",
282+
"arr_variance",
283+
"arr_window",
284+
"arr_zeros",
285+
"arr_zip",
286+
"asin",
287+
"atan",
288+
"atan2",
289+
"attractor_bucket",
290+
"attractor_table",
291+
"bit_count",
292+
"bit_length",
293+
"call",
294+
"ceil",
295+
"clamp",
296+
"classify_resonance",
297+
"cleanup_array",
298+
"collapse",
299+
"cos",
300+
"crt_residues",
301+
"csv_parse",
302+
"cube",
303+
"defined_functions",
304+
"dict_clear",
305+
"dict_del",
306+
"dict_get_or",
307+
"dict_has",
308+
"dict_items",
309+
"dict_keys",
310+
"dict_len",
311+
"dict_merge",
312+
"dict_new",
313+
"dict_pop",
314+
"dict_size",
315+
"dict_values",
316+
"digit_count",
317+
"digit_sum",
318+
"dual_cos",
319+
"dual_exp",
320+
"dual_neg",
321+
"dual_pow_int",
322+
"dual_relu",
323+
"dual_sigmoid",
324+
"dual_sin",
325+
"dual_tanh",
326+
"dual_v",
327+
"e",
328+
"ensure_clean",
329+
"erf",
330+
"error",
331+
"even",
332+
"exp",
333+
"factorial",
334+
"fib",
335+
"fib_chunks",
336+
"fibonacci",
337+
"file_exists",
338+
"filter_by_resonance",
339+
"floor",
340+
"fnv1a_hash",
341+
"fold",
342+
"fold_escape",
343+
"frac",
344+
"from_zeckendorf",
345+
"gcd",
346+
"harmonic_align",
347+
"harmonic_checksum",
348+
"harmonic_dedupe",
349+
"harmonic_diff",
350+
"harmonic_hash",
351+
"harmonic_interfere",
352+
"harmonic_partition",
353+
"harmonic_partition_3",
354+
"harmonic_read_file",
355+
"harmonic_resample",
356+
"harmonic_score",
357+
"harmonic_sort",
358+
"harmonic_split",
359+
"harmonic_unalign",
360+
"harmonic_write_file",
361+
"harmony_value",
362+
"hbit_tension",
363+
"hypot",
364+
"int_binary_search",
365+
"int_lower_bound",
366+
"int_upper_bound",
367+
"interfere",
368+
"is_even",
369+
"is_fibonacci",
370+
"is_instance",
371+
"is_odd",
372+
"is_phi_resonant",
373+
"is_prime",
374+
"is_singularity",
375+
"is_zeckendorf_valid",
376+
"largest_attractor_at_most",
377+
"lcm",
378+
"len",
379+
"lerp",
380+
"ln_2",
381+
"log",
382+
"log10",
383+
"log2",
384+
"log_phi_pi_fibonacci",
385+
"max",
386+
"mean_omni_weight",
387+
"measure_coherence",
388+
"min",
389+
"mod_pow",
390+
"nearest_attractor",
391+
"now_ms",
392+
"nth_fibonacci",
393+
"odd",
394+
"omc_code_canonical",
395+
"omc_code_equivalent",
396+
"omc_error_categories",
397+
"omc_error_count",
398+
"omc_token_vocab_size",
399+
"phi",
400+
"phi_inv",
401+
"phi_pi_bin_search",
402+
"phi_pi_fib_nearest",
403+
"phi_pi_fib_nearest_traced",
404+
"phi_pi_fib_nearest_v2",
405+
"phi_pi_fib_reset",
406+
"phi_pi_fib_search",
407+
"phi_pi_fib_search_traced",
408+
"phi_pi_fib_search_v2",
409+
"phi_pi_fib_stats",
410+
"phi_pi_fib_stats_all",
411+
"phi_pi_fib_stats_bg",
412+
"phi_pi_log_distance",
413+
"phi_pi_pow",
414+
"phi_pow",
415+
"phi_shadow",
416+
"phi_sq",
417+
"phi_squared",
418+
"pi",
419+
"pow",
420+
"pow_int",
421+
"print_raw",
422+
"println",
423+
"quantization_ratio",
424+
"quantize",
425+
"random_float",
426+
"random_int",
427+
"random_seed",
428+
"re_find",
429+
"re_split",
430+
"read_file",
431+
"res",
432+
"resolve_singularity",
433+
"resonance_band",
434+
"resonance_band_histogram",
435+
"round",
436+
"safe_add",
437+
"safe_arr_get",
438+
"safe_arr_set",
439+
"safe_divide",
440+
"safe_log",
441+
"safe_mod",
442+
"safe_mul",
443+
"safe_sqrt",
444+
"safe_sub",
445+
"sigmoid",
446+
"sign",
447+
"sin",
448+
"sorted_dedupe",
449+
"sorted_merge",
450+
"sorted_union",
451+
"sqrt",
452+
"sqrt_2",
453+
"sqrt_5",
454+
"square",
455+
"str_capitalize",
456+
"str_chars",
457+
"str_concat",
458+
"str_contains",
459+
"str_count",
460+
"str_ends_with",
461+
"str_index_of",
462+
"str_is_empty",
463+
"str_lowercase",
464+
"str_pad_left",
465+
"str_pad_right",
466+
"str_repeat",
467+
"str_replace",
468+
"str_reverse",
469+
"str_split_lines",
470+
"str_starts_with",
471+
"str_to_float",
472+
"str_to_int",
473+
"str_trim",
474+
"str_uppercase",
475+
"substrate_count_range",
476+
"substrate_difference",
477+
"substrate_hash",
478+
"substrate_insert",
479+
"substrate_intersect",
480+
"substrate_lower_bound",
481+
"substrate_min_distance",
482+
"substrate_nearest",
483+
"substrate_quantile",
484+
"substrate_rank",
485+
"substrate_search",
486+
"substrate_select_k",
487+
"substrate_slice_range",
488+
"substrate_upper_bound",
489+
"tan",
490+
"tanh",
491+
"tape_neg",
492+
"tape_pow_int",
493+
"tau",
494+
"test_clear_failures",
495+
"test_failure_count",
496+
"test_get_current",
497+
"test_get_failures",
498+
"test_record_failure",
499+
"test_set_current",
500+
"to_float",
501+
"to_int",
502+
"type_of",
503+
"value_danger",
504+
"write_file",
505+
"zeckendorf",
506+
"zeckendorf_bit",
507+
"zeckendorf_weight",
508+
" 0;\n",
509+
" 1;\n",
510+
" 2;\n",
511+
" -1;\n",
512+
"h x = ",
513+
"h y = ",
514+
"h i = ",
515+
"h s = ",
516+
"h n = ",
517+
"h r = ",
518+
"h sum = 0",
519+
"h count = 0",
520+
"h result = ",
521+
"i = i + 1;",
522+
"j = j + 1;",
523+
"k = k + 1;",
524+
" < n {",
525+
" < arr_len(",
526+
"} else {",
527+
"} else if ",
528+
"while i < ",
529+
"for x in ",
530+
"for v in ",
531+
"fn test_",
532+
"test_record_failure(",
533+
"assert_eq(",
534+
"assert_true(",
535+
"assert_true(arr_len(",
536+
" == 1, \"",
537+
" == 0, \"",
538+
"approx_eq(",
539+
"to_string(",
540+
".items.borrow()",
541+
"if arr_get(",
542+
"return arr_get(",
543+
"arr_push(out, ",
544+
"h out = [];",
545+
"h out = arr_new()",
546+
"h xs = [",
547+
"h ys = [",
548+
"if condition",
549+
"is empty",
550+
"out of bounds",
551+
"shape mismatch",
552+
" }\n",
553+
" {\n ",
554+
" {\n",
555+
");\n",
556+
", ",
557+
" + 1",
558+
" - 1",
559+
" * 2",
560+
" / 2",
221561
];
222562

223563
/// Substrate distance between two token IDs. Returns the absolute

0 commit comments

Comments
 (0)