MCP: wire v0.3 predict tools (omc_predict + omc_corpus_size)

RandomCoder-lab · claude · RandomCoder-lab · commit 33bd9c5b4f65 · 2026-05-17T12:35:02.000-05:00
v0.3 shipped the substrate-indexed completion engine as an OMC builtin but exposing it over MCP is what makes it actually useful to LLM clients (Claude Desktop, Cursor, the MCP server in this very session). ## New MCP tools - `omc_predict(paths, prefix, top_k?)` — wraps predict_continuations end-to-end. Returns a JSON payload with prefix echo, corpus_size, top_k, and a `suggestions` array. Each suggestion carries fn_name, source (full body), file, canonical_hash, attractor, prefix_match_len, substrate_distance, query_attractor. top_k clamps to [1, 50] so a misconfigured client can't grab the whole corpus. - `omc_corpus_size(paths)` — diagnostic. Returns fn_count for a list of paths. Used to verify file resolution before a larger predict call. ## Implementation - Both handlers share a `parse_paths_arg` helper for the array-of-strings validation pattern and a `build_corpus` helper for the read-and-ingest-each-file pattern. I/O errors surface as MCP-style `isError: true` strings, not panics. - predict_continuations is called directly from main.rs via `omnimcode_core::predict::{CodeCorpus, predict_continuations}` — no eval_program detour, no display_value formatting. The structured JSON output matches what `omc_help` and others do. ## Tests (first MCP tests in the crate) 8 integration tests in tests/integration.rs spawn the binary and exercise JSON-RPC over stdio: - initialize returns server info - tools/list includes both new tools - omc_corpus_size ingests Prometheus (>30 fns) - omc_predict on `fn prom_linear_` returns exactly forward/new/params with provenance fields populated - top_k caps results - missing 'paths' arg → friendly error string with the tool name - unreadable path → friendly error naming the path - unknown tool name → isError: true with name in the message Final: 231 Rust pass (was 223 + 8 new integration), 1087/1087 OMC. 
## End-to-end verification $ {echo init; echo tools/list; echo predict-call} | omnimcode-mcp initialize: server=omnimcode-mcp tools/list: 9 tools (was 7), predict present: True omc_corpus_size: 70 fns omc_predict (prefix='fn prom_linear_'): prom_linear_forward prefix_len=24 dist=1.37e+18 prom_linear_new prefix_len=24 dist=2.44e+18 prom_linear_params prefix_len=24 dist=5.51e+18 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
diff --git a/omnimcode-mcp/README.md b/omnimcode-mcp/README.md
@@ -16,8 +16,18 @@ needs to discover the language at runtime.
 - `omc_categories()` — list builtin categories
 - `omc_unique_builtins()` — OMC-only primitives (no NumPy equivalent)
 - `omc_explain_error(message)` — pattern-match an error against the
-  259-entry knowledge base; returns explanation + cause + fix
+  curated knowledge base; returns explanation + cause + fix
 - `omc_did_you_mean(name)` — typo suggestions over the known surface
+- **`omc_predict(paths, prefix, top_k?)`** — substrate-indexed code
+  completion ([v0.3 chapter](https://github.com/RandomCoder-lab/OMC/releases/tag/v0.3-symbolic-prediction)).
+  Given a partial OMC prefix (e.g. `fn prom_linear_`), returns the
+  top-k ranked continuations (`top_k` defaults to 5, clamped to
+  1–50) from a content-addressed corpus. Each suggestion carries the
+  full source, file path, canonical hash, prefix-match depth, and
+  substrate distance — branching is first-class.
+- **`omc_corpus_size(paths)`** — diagnostic: how many top-level fns
+  resolve across a list of OMC files. Use to verify paths before a
+  predict call.
 
 ## Build
 
diff --git a/omnimcode-mcp/src/main.rs b/omnimcode-mcp/src/main.rs
@@ -28,6 +28,7 @@ use omnimcode_core::docs;
 use omnimcode_core::errors;
 use omnimcode_core::interpreter::Interpreter;
 use omnimcode_core::parser::Parser;
+use omnimcode_core::predict::{CodeCorpus, predict_continuations};
 use omnimcode_core::value::Value;
 
 #[derive(Debug, Deserialize)]
@@ -212,6 +213,55 @@ fn list_tools() -> Vec<Json> {
                 "required": ["name"]
             }
         }),
+        json!({
+            "name": "omc_predict",
+            "description": "Substrate-indexed code completion. Given a partial OMC code prefix \
+                            (e.g. `fn prom_linear_`), return the top-k ranked continuations from \
+                            a content-addressed corpus of OMC files. Each result is a viable \
+                            branch: it carries the full source of the matching fn, its file \
+                            path, canonical hash, prefix-match depth, and substrate distance. \
+                            Use to find similar fns when authoring code, to navigate a corpus \
+                            without grepping, or to surface stable callable shapes that an LLM \
+                            can adapt rather than invent from scratch.",
+            "inputSchema": {
+                "type": "object",
+                "properties": {
+                    "paths": {
+                        "type": "array",
+                        "items": { "type": "string" },
+                        "description": "Source file paths to ingest. Top-level fns from each file are added to the corpus."
+                    },
+                    "prefix": {
+                        "type": "string",
+                        "description": "Partial OMC source (e.g. `fn prom_linear_`). May be incomplete."
+                    },
+                    "top_k": {
+                        "type": "integer",
+                        "minimum": 1,
+                        "default": 5,
+                        "description": "Number of ranked continuations to return."
+                    }
+                },
+                "required": ["paths", "prefix"]
+            }
+        }),
+        json!({
+            "name": "omc_corpus_size",
+            "description": "Diagnostic: report how many top-level fns are ingested across a list \
+                            of OMC source paths. Useful for verifying paths resolve before \
+                            building a larger predict query.",
+            "inputSchema": {
+                "type": "object",
+                "properties": {
+                    "paths": {
+                        "type": "array",
+                        "items": { "type": "string" },
+                        "description": "Source file paths to ingest."
+                    }
+                },
+                "required": ["paths"]
+            }
+        }),
     ]
 }
 
@@ -284,10 +334,74 @@ fn dispatch_tool(interp: &mut Interpreter, name: &str, args: &Json) -> Result<St
             let suggestions = docs::did_you_mean(name, 5);
             Ok(serde_json::to_string_pretty(&json!(suggestions)).unwrap())
         }
+        "omc_predict" => {
+            let paths = parse_paths_arg(args, "omc_predict")?;
+            let prefix = args.get("prefix").and_then(Json::as_str)
+                .ok_or_else(|| "omc_predict: missing 'prefix' arg".to_string())?;
+            // top_k optional, defaults to 5. Clamp to [1, 50] so a
+            // misconfigured client can't ask for the entire corpus.
+            let top_k = args.get("top_k").and_then(Json::as_i64)
+                .unwrap_or(5)
+                .clamp(1, 50) as usize;
+            let corpus = build_corpus(&paths)?;
+            let suggestions = predict_continuations(&corpus, prefix, top_k);
+            let payload = json!({
+                "prefix": prefix,
+                "corpus_size": corpus.len(),
+                "top_k": top_k,
+                "suggestions": suggestions.iter().map(|s| json!({
+                    "fn_name": s.fn_name,
+                    "source": s.source,
+                    "file": s.file,
+                    "canonical_hash": s.canonical_hash,
+                    "attractor": s.attractor,
+                    "prefix_match_len": s.prefix_match_len,
+                    "substrate_distance": s.substrate_distance,
+                    "query_attractor": s.query_attractor,
+                })).collect::<Vec<_>>(),
+            });
+            Ok(serde_json::to_string_pretty(&payload).unwrap())
+        }
+        "omc_corpus_size" => {
+            let paths = parse_paths_arg(args, "omc_corpus_size")?;
+            let corpus = build_corpus(&paths)?;
+            let payload = json!({
+                "paths": paths,
+                "fn_count": corpus.len(),
+            });
+            Ok(serde_json::to_string_pretty(&payload).unwrap())
+        }
         _ => Err(format!("Unknown tool: {}", name)),
     }
 }
 
+/// Pull the `paths` argument out of a tool call's JSON args as owned strings.
+/// Errors (prefixed with `tool`) if it is absent, not an array, or has a non-string entry.
+fn parse_paths_arg(args: &Json, tool: &str) -> Result<Vec<String>, String> {
+    let entries = match args.get("paths") {
+        None => return Err(format!("{}: missing 'paths' arg", tool)),
+        Some(raw) => raw.as_array()
+            .ok_or_else(|| format!("{}: 'paths' must be an array of strings", tool))?,
+    };
+    entries.iter().map(|entry| match entry.as_str() {
+        Some(path) => Ok(path.to_owned()),
+        None => Err(format!("{}: every 'paths' entry must be a string", tool)),
+    }).collect()
+}
+
+/// Build a CodeCorpus by reading + ingesting every file in `paths`, in order.
+/// The first read failure becomes an `Err` naming the path; the message is tool-neutral
+/// because both omc_predict and omc_corpus_size call this helper.
+fn build_corpus(paths: &[String]) -> Result<CodeCorpus, String> {
+    let mut corpus = CodeCorpus::new();
+    for path in paths {
+        let src = std::fs::read_to_string(path)
+            .map_err(|e| format!("cannot read {}: {}", path, e))?;
+        corpus.ingest_file(path, &src);
+    }
+    Ok(corpus)
+}
+
 /// Evaluate an OMC program. Errors come back as structured strings
 /// (the MCP client sees isError=true alongside the text). Each
 /// tools/call uses a fresh interpreter to avoid state bleed.
diff --git a/omnimcode-mcp/tests/integration.rs b/omnimcode-mcp/tests/integration.rs
@@ -0,0 +1,219 @@
+//! End-to-end MCP protocol tests.
+//!
+//! Spawns the binary, talks JSON-RPC over stdio, asserts on the
+//! responses. Covers the full request → handler → response path
+//! including JSON parsing and protocol-level errors.
+//!
+//! Why integration rather than unit tests: the crate is bin-only, so
+//! handler functions aren't reachable from a unit-test module. This
+//! also exercises the actual protocol path a real LLM client would use.
+
+use std::io::{BufRead, BufReader, Write};
+use std::path::PathBuf;
+use std::process::{Command, Stdio};
+
+use serde_json::{json, Value};
+
+/// Find the built `omnimcode-mcp` binary relative to the test
+/// executable's path (target/<profile>/deps/integration-<hash> →
+/// target/<profile>/omnimcode-mcp, plus `.exe` on Windows).
+fn find_binary() -> PathBuf {
+    let exe = std::env::current_exe().expect("current_exe");
+    let target_profile_dir = exe.parent().and_then(|deps| deps.parent())
+        .expect("test executable should live in target/<profile>/deps");
+    // EXE_SUFFIX is ".exe" on Windows and empty on Unix-like targets.
+    let bin = target_profile_dir.join(format!("omnimcode-mcp{}", std::env::consts::EXE_SUFFIX));
+    assert!(
+        bin.exists(),
+        "binary not found at {} — rebuild with `cargo build -p omnimcode-mcp`",
+        bin.display()
+    );
+    bin
+}
+
+/// OMC repo root: the parent of this crate's directory (CARGO_MANIFEST_DIR).
+/// Fixtures such as `examples/lib/prometheus.omc` are addressed relative to it.
+fn repo_root() -> PathBuf {
+    let crate_dir = std::path::Path::new(env!("CARGO_MANIFEST_DIR"));
+    crate_dir.parent().unwrap().to_owned()
+}
+
+/// Run one fresh server process through a scripted JSON-RPC session.
+/// Every request is written as a line on the child's stdin, stdin is then
+/// closed, and each non-blank stdout line is parsed into a `Value`, in order.
+fn rpc_exchange(requests: &[Value]) -> Vec<Value> {
+    let mut server = Command::new(find_binary())
+        .current_dir(repo_root()) // relative fixture paths resolve from the repo root
+        .stdin(Stdio::piped())
+        .stdout(Stdio::piped())
+        .stderr(Stdio::null())
+        .spawn()
+        .expect("spawn mcp server");
+    let mut to_server = server.stdin.take().expect("stdin");
+    let from_server = server.stdout.take().expect("stdout");
+    for request in requests {
+        writeln!(to_server, "{}", request).expect("write");
+    }
+    // Close stdin so the server sees EOF; it is expected to exit after
+    // replying, which ends the read loop below when its stdout closes.
+    drop(to_server);
+    let responses: Vec<Value> = BufReader::new(from_server)
+        .lines()
+        .map(|line| line.expect("read"))
+        .filter(|line| !line.trim().is_empty())
+        .map(|line| {
+            serde_json::from_str(&line).unwrap_or_else(|e| panic!("parse {}: {}", line, e))
+        })
+        .collect();
+    let _ = server.wait();
+    responses
+}
+
+/// A lone `initialize` request yields exactly one reply that echoes the id and names the server.
+#[test]
+fn initialize_returns_server_info() {
+    let init = json!({"jsonrpc":"2.0","id":1,"method":"initialize","params":{}});
+    let replies = rpc_exchange(&[init]);
+    assert_eq!(replies.len(), 1);
+    let reply = &replies[0];
+    assert_eq!(reply["id"], 1);
+    assert_eq!(reply["result"]["serverInfo"]["name"], "omnimcode-mcp");
+}
+
+/// `tools/list` advertises the two predict tools alongside pre-existing ones.
+#[test]
+fn tools_list_includes_predict_tools() {
+    let session = [
+        json!({"jsonrpc":"2.0","id":1,"method":"initialize","params":{}}),
+        json!({"jsonrpc":"2.0","id":2,"method":"tools/list","params":{}}),
+    ];
+    let replies = rpc_exchange(&session);
+    let listed = replies[1]["result"]["tools"].as_array().unwrap();
+    let names: Vec<&str> =
+        listed.iter().map(|tool| tool["name"].as_str().unwrap()).collect();
+    assert!(names.contains(&"omc_predict"), "predict tool present: {:?}", names);
+    assert!(names.contains(&"omc_corpus_size"), "corpus_size present: {:?}", names);
+    // omc_eval and omc_help predate the predict tools and must still be listed.
+    assert!(names.contains(&"omc_eval"));
+    assert!(names.contains(&"omc_help"));
+}
+
+/// `omc_corpus_size` on the Prometheus fixture succeeds and reports more than 30 fns.
+#[test]
+fn omc_corpus_size_ingests_prometheus() {
+    let call = json!({"jsonrpc":"2.0","id":2,"method":"tools/call","params":{
+        "name":"omc_corpus_size",
+        "arguments":{"paths":["examples/lib/prometheus.omc"]}
+    }});
+    let init = json!({"jsonrpc":"2.0","id":1,"method":"initialize","params":{}});
+    let replies = rpc_exchange(&[init, call]);
+    let reply = &replies[1];
+    assert_eq!(reply["result"]["isError"], false, "should not be an error: {}", reply);
+    let body = reply["result"]["content"][0]["text"].as_str().unwrap();
+    let payload: Value = serde_json::from_str(body).unwrap();
+    // The fixture has ~70 fns currently, so only a lower bound is a stable assertion.
+    let fn_count = payload["fn_count"].as_i64().unwrap();
+    assert!(fn_count > 30, "expected >30 fns, got {}", fn_count);
+}
+
+/// `omc_predict` with prefix `fn prom_linear_` surfaces the new/forward/params fns,
+/// and the first suggestion carries its provenance fields.
+#[test]
+fn omc_predict_ranks_prom_linear_prefix() {
+    let init = json!({"jsonrpc":"2.0","id":1,"method":"initialize","params":{}});
+    let call = json!({"jsonrpc":"2.0","id":2,"method":"tools/call","params":{
+        "name":"omc_predict",
+        "arguments":{
+            "paths":["examples/lib/prometheus.omc"],
+            "prefix":"fn prom_linear_",
+            "top_k":5
+        }
+    }});
+    let replies = rpc_exchange(&[init, call]);
+    let reply = &replies[1];
+    assert_eq!(reply["result"]["isError"], false, "should not be an error: {}", reply);
+    let body = reply["result"]["content"][0]["text"].as_str().unwrap();
+    let payload: Value = serde_json::from_str(body).unwrap();
+    assert_eq!(payload["prefix"], "fn prom_linear_");
+    let hits = payload["suggestions"].as_array().unwrap();
+    assert!(hits.len() >= 3, "should have at least 3 hits for fn prom_linear_, got {}", hits.len());
+    let names: Vec<&str> =
+        hits.iter().map(|hit| hit["fn_name"].as_str().unwrap()).collect();
+    assert!(names.contains(&"prom_linear_new"), "missing prom_linear_new in {:?}", names);
+    assert!(names.contains(&"prom_linear_forward"), "missing prom_linear_forward in {:?}", names);
+    assert!(names.contains(&"prom_linear_params"), "missing prom_linear_params in {:?}", names);
+    // Provenance fields are checked on the first suggestion only.
+    let first = &hits[0];
+    assert!(first["source"].is_string(), "source field");
+    assert_eq!(first["file"], "examples/lib/prometheus.omc");
+    assert!(first["canonical_hash"].is_i64(), "canonical_hash field");
+    assert!(first["prefix_match_len"].as_i64().unwrap() > 0, "prefix matched some tokens");
+    assert!(first["substrate_distance"].as_i64().unwrap() >= 0);
+}
+
+/// With `top_k` = 2 and prefix `fn prom_`, at most two suggestions come back.
+#[test]
+fn omc_predict_top_k_caps_results() {
+    let init = json!({"jsonrpc":"2.0","id":1,"method":"initialize","params":{}});
+    let call = json!({"jsonrpc":"2.0","id":2,"method":"tools/call","params":{
+        "name":"omc_predict",
+        "arguments":{
+            "paths":["examples/lib/prometheus.omc"],
+            "prefix":"fn prom_",
+            "top_k":2
+        }
+    }});
+    let replies = rpc_exchange(&[init, call]);
+    let body = replies[1]["result"]["content"][0]["text"].as_str().unwrap();
+    let payload: Value = serde_json::from_str(body).unwrap();
+    let returned = payload["suggestions"].as_array().unwrap().len();
+    assert!(returned <= 2, "top_k=2 capped at 2, got {}", returned);
+}
+
+/// Omitting `paths` yields an `isError` reply whose text names the tool and the missing arg.
+#[test]
+fn omc_predict_missing_paths_is_a_friendly_error() {
+    let init = json!({"jsonrpc":"2.0","id":1,"method":"initialize","params":{}});
+    let call = json!({"jsonrpc":"2.0","id":2,"method":"tools/call","params":{
+        "name":"omc_predict",
+        "arguments":{"prefix":"fn anything","top_k":3}
+    }});
+    let responses = rpc_exchange(&[init, call]);
+    let r = &responses[1];
+    assert_eq!(r["result"]["isError"], true);
+    let text = r["result"]["content"][0]["text"].as_str().unwrap();
+    assert!(text.contains("omc_predict") && text.contains("missing 'paths'"), "error names tool and arg: {}", text);
+}
+
+/// A path that cannot be read produces an `isError` reply mentioning "read" and the path.
+#[test]
+fn omc_predict_unreadable_path_is_friendly() {
+    let init = json!({"jsonrpc":"2.0","id":1,"method":"initialize","params":{}});
+    let call = json!({"jsonrpc":"2.0","id":2,"method":"tools/call","params":{
+        "name":"omc_predict",
+        "arguments":{
+            "paths":["/nonexistent/path/does/not/exist.omc"],
+            "prefix":"fn foo"
+        }
+    }});
+    let replies = rpc_exchange(&[init, call]);
+    let reply = &replies[1];
+    assert_eq!(reply["result"]["isError"], true);
+    let message = reply["result"]["content"][0]["text"].as_str().unwrap();
+    let names_bad_path = message.contains("read") && message.contains("nonexistent");
+    assert!(names_bad_path, "names the bad path: {}", message);
+}
+
+/// An unrecognised tool name yields an `isError` reply that echoes the offending name.
+#[test]
+fn unknown_tool_returns_error_text() {
+    let init = json!({"jsonrpc":"2.0","id":1,"method":"initialize","params":{}});
+    let call = json!({"jsonrpc":"2.0","id":2,"method":"tools/call","params":{
+        "name":"omc_does_not_exist","arguments":{}
+    }});
+    let responses = rpc_exchange(&[init, call]);
+    let r = &responses[1];
+    assert_eq!(r["result"]["isError"], true);
+    let text = r["result"]["content"][0]["text"].as_str().unwrap();
+    assert!(text.contains("Unknown tool") && text.contains("omc_does_not_exist"), "error names the tool: {}", text);
+}