|
| 1 | +# Substrate-token adapter — the LLM compression / semantic-distance layer. |
| 2 | + |
# Record a test failure when `actual` differs from `expected`.
# The recorded message is `msg` followed by both values rendered via to_string.
fn assert_eq(actual, expected, msg) {
    if actual != expected {
        h report = msg + ": expected " + to_string(expected) + " got " + to_string(actual);
        test_record_failure(report);
    }
}
| 8 | + |
# Record a test failure carrying `msg` when `cond` does not hold.
fn assert_true(cond, msg) {
    if !cond {
        test_record_failure(msg);
    }
}
| 12 | + |
# Return whether `a` and `b` differ by at most `tol` (absolute difference).
fn approx_eq(a, b, tol) {
    h gap = a - b;
    # A negative gap is compared by its magnitude.
    if gap < 0.0 {
        return (0.0 - gap) <= tol;
    }
    return gap <= tol;
}
| 18 | + |
| 19 | +# ---- Encode/decode round-trip ---- |
| 20 | + |
# Encoding then decoding a one-line snippet must give back the original text.
fn test_roundtrip_simple() {
    h original = "h x = arr_softmax([1.0]);";
    h restored = omc_token_decode(omc_token_encode(original));
    assert_eq(restored, original, "simple roundtrip");
}
| 27 | + |
# Same round-trip check as the simple case, on a snippet containing newlines.
fn test_roundtrip_multiline() {
    h original = "fn main() {\n h x = arr_get([1, 2, 3], 0);\n return x;\n}";
    h restored = omc_token_decode(omc_token_encode(original));
    assert_eq(restored, original, "multiline roundtrip");
}
| 34 | + |
# Round-trip a snippet containing a non-ASCII identifier.
# Per the original note, non-ASCII bytes are escaped as [0, byte] pairs;
# this test only checks that the decoded text equals the input.
fn test_roundtrip_unicode_via_escape() {
    h original = "h α = 3;";
    h restored = omc_token_decode(omc_token_encode(original));
    assert_eq(restored, original, "unicode roundtrip");
}
| 42 | + |
# The empty string must encode to zero IDs and decode back to the empty string.
fn test_empty_string() {
    h token_ids = omc_token_encode("");
    assert_eq(arr_len(token_ids), 0, "empty source → empty ids");
    assert_eq(omc_token_decode(token_ids), "", "empty roundtrip");
}
| 50 | + |
| 51 | +# ---- Vocab & compression ---- |
| 52 | + |
# The vocabulary must hold more than 100 entries, and omc_token_vocab_size()
# must agree with the length of the array returned by omc_token_vocab().
fn test_vocab_nonempty() {
    h vocab = omc_token_vocab();
    h entry_count = arr_len(vocab);
    assert_true(entry_count > 100, "vocab has >100 entries");
    assert_eq(omc_token_vocab_size(), entry_count, "vocab_size matches array length");
}
| 59 | + |
# Vocabulary entry 0 is expected to be the escape sentinel.
# NOTE(review): only non-emptiness is asserted here; the check does not verify
# that the entry differs from a normal substring.
fn test_vocab_id_0_is_escape() {
    h sentinel = arr_get(omc_token_vocab(), 0);
    assert_true(str_len(sentinel) > 0, "ID 0 is a non-empty sentinel");
}
| 66 | + |
# A snippet that repeats `arr_softmax` three times must report a compression
# ratio above 1. Per the original note, each 11-byte `arr_softmax` collapses
# to a single ID.
fn test_compression_is_real() {
    h snippet = "h x = arr_softmax([1.0]); h y = arr_softmax([2.0]); h z = arr_softmax([3.0]);";
    assert_true(omc_token_compression_ratio(snippet) > 1.0, "compression ratio > 1");
}
| 73 | + |
| 74 | +# ---- Substrate distance between token IDs ---- |
| 75 | + |
# The distance from a token ID to itself must be 0.
# Each assertion carries the ID in its message so a recorded failure points
# at the specific case instead of two indistinguishable entries.
fn test_token_distance_self_is_zero() {
    assert_eq(omc_token_distance(3, 3), 0, "self-distance is 0 (id 3)");
    assert_eq(omc_token_distance(8, 8), 0, "self-distance is 0 (id 8)");
}
| 80 | + |
# The distance between IDs 3 and 5 must be smaller than between IDs 3 and 100.
# Per the original note, 3 and 5 are both Fibonacci attractors.
fn test_token_distance_close_for_attractors() {
    h near = omc_token_distance(3, 5);
    h far = omc_token_distance(3, 100);
    assert_true(near < far, "near IDs have smaller distance than far IDs");
}
| 88 | + |
| 89 | +# ---- CRT pack / unpack ---- |
| 90 | + |
# Pack three values with the default moduli, unpack them, and expect the same
# three values in the same order.
fn test_crt_roundtrip_default_moduli() {
    h streams = omc_token_unpack(omc_token_pack([3, 42, 7]));
    assert_eq(arr_len(streams), 3, "unpacked has 3 streams");
    assert_eq(arr_get(streams, 0), 3, "stream 0 preserved");
    assert_eq(arr_get(streams, 1), 42, "stream 1 preserved");
    assert_eq(arr_get(streams, 2), 7, "stream 2 preserved");
}
| 99 | + |
# Pack and unpack with caller-supplied moduli [3, 5, 7]; every input value is
# below its modulus, so each stream must come back unchanged.
fn test_crt_custom_moduli() {
    h moduli = [3, 5, 7];
    h packed = omc_token_pack([1, 2, 4], moduli);
    h unpacked = omc_token_unpack(packed, moduli);
    # Check the stream count first, matching the default-moduli test, so a
    # wrong-length result is reported explicitly.
    assert_eq(arr_len(unpacked), 3, "unpacked has 3 streams");
    assert_eq(arr_get(unpacked, 0), 1, "stream 0");
    assert_eq(arr_get(unpacked, 1), 2, "stream 1");
    assert_eq(arr_get(unpacked, 2), 4, "stream 2");
}
| 108 | + |
| 109 | +# ---- Code-hash equivalence ---- |
| 110 | + |
# Hashing the same source text twice must give equal "attractor" and "raw"
# entries in the returned dicts.
fn test_code_hash_same_for_same_code() {
    h snippet = "arr_softmax([1, 2, 3])";
    h first = omc_code_hash(snippet);
    h second = omc_code_hash(snippet);
    assert_eq(dict_get(first, "attractor"), dict_get(second, "attractor"),
        "identical code → same attractor");
    assert_eq(dict_get(first, "raw"), dict_get(second, "raw"),
        "identical code → same raw hash");
}
| 119 | + |
# The dict returned by omc_code_hash must expose "raw", "attractor",
# "distance" and "resonance", each rendering to a non-empty string.
fn test_code_hash_returns_full_dict() {
    h h_dict = omc_code_hash("h x = 1;");
    assert_true(str_len(to_string(dict_get(h_dict, "raw"))) > 0, "has raw");
    assert_true(str_len(to_string(dict_get(h_dict, "attractor"))) > 0, "has attractor");
    # Was `>= 0`, which holds for every string length and so could never fail.
    assert_true(str_len(to_string(dict_get(h_dict, "distance"))) > 0, "has distance");
    assert_true(str_len(to_string(dict_get(h_dict, "resonance"))) > 0, "has resonance");
}
| 127 | + |
# Two identical source strings must be at code distance 0.
fn test_code_distance_zero_for_identical() {
    assert_eq(omc_code_distance("h x = 1;", "h x = 1;"), 0, "identical code → distance 0");
}
| 132 | + |
# Two source strings that differ in a literal must be at a positive code distance.
fn test_code_distance_nonzero_for_different() {
    h gap = omc_code_distance("h x = 1;", "h x = 999;");
    assert_true(gap > 0, "different code → positive distance");
}
0 commit comments