add llm_judge, llm_compare, file_ls builtins

RandomCoder-lab · claude · RandomCoder-lab · commit d1b0677b4ec8 · 2026-05-18T20:40:02.000-05:00
- New: llm_judge(responses[], criteria, model?) — score N responses 1-10
  via structured LLM eval; returns [{idx, score, reason}] array
- New: llm_compare(a, b, criteria, model?) — pick winner of two responses;
  returns {winner: "A"|"B", reason: "..."}
- New: file_ls(path?) — list directory entries as sorted string array
- ALL_BUILTINS list deduplicated and extended with new LLM builtins;
  is_known_builtin also updated for first-class function support

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
diff --git a/omnimcode-core/src/interpreter.rs b/omnimcode-core/src/interpreter.rs
@@ -2229,9 +2229,11 @@ impl Interpreter {
             | "sha256" | "sha512" | "base64_encode" | "base64_decode"
             // LLM builtins
             | "llm_call" | "llm_chat" | "llm_embed" | "llm_models" | "llm_system"
-            | "llm_stream_print"
+            | "llm_stream_print" | "llm_judge" | "llm_compare"
             | "llm_tools" | "substrate_embed"
             | "batch_llm_call" | "batch_llm_chat"
+            // File utilities
+            | "file_ls"
             // HTTP builtins
             | "http_get" | "http_post" | "http_post_json" | "http_put" | "http_delete"
             | "now_iso" | "now_unix" | "format_time" | "parse_time"
@@ -4867,6 +4869,22 @@ impl Interpreter {
                 let exists = std::path::Path::new(&path).exists();
                 Ok(Value::HInt(HInt::new(if exists { 1 } else { 0 })))
             }
+            // file_ls(path?) -> string[]
+            //   Lists the entry names of `path` (defaults to "."), ordered by display string.
+            "file_ls" => {
+                let dir = match args.first() {
+                    Some(expr) => self.eval_expr(expr)?.to_display_string(),
+                    None => ".".to_string(),
+                };
+                let listing = std::fs::read_dir(&dir).map_err(|e| format!("file_ls: {}", e))?;
+                // Entries that fail to read are skipped rather than aborting the listing.
+                let mut names: Vec<Value> = listing
+                    .flatten()
+                    .map(|entry| Value::String(entry.file_name().to_string_lossy().to_string()))
+                    .collect();
+                names.sort_by_cached_key(|name| name.to_display_string());
+                Ok(Value::Array(HArray::from_vec(names)))
+            }
             // Introspection and utility.
             "type_of" => {
                 if args.is_empty() {
@@ -9601,6 +9619,39 @@ impl Interpreter {
                 };
                 crate::llm_builtins::llm_stream_print(&prompt, system.as_deref(), model.as_deref())
             }
+            // llm_judge(responses, criteria, model?) -> array
+            //   Evaluates the arguments and delegates to llm_builtins::llm_judge. `criteria` is
+            //   stringified; `responses` is passed through as-is for the builtin to validate.
+            "llm_judge" => {
+                if args.len() < 2 {
+                    return Err("llm_judge requires (responses, criteria, model?)".to_string());
+                }
+                let candidates = self.eval_expr(&args[0])?;
+                let rubric = self.eval_expr(&args[1])?.to_display_string();
+                // An omitted or null third argument means no model override.
+                let model = match args.get(2).map(|expr| self.eval_expr(expr)).transpose()? {
+                    None | Some(Value::Null) => None,
+                    Some(other) => Some(other.to_display_string()),
+                };
+                crate::llm_builtins::llm_judge(&candidates, &rubric, model.as_deref())
+            }
+            // llm_compare(a, b, criteria, model?) -> dict
+            //   Stringifies both candidates and the criteria, then delegates to
+            //   llm_builtins::llm_compare, which asks for {winner: "A"|"B", reason: "..."}.
+            "llm_compare" => {
+                if args.len() < 3 {
+                    return Err("llm_compare requires (a, b, criteria, model?)".to_string());
+                }
+                let first = self.eval_expr(&args[0])?.to_display_string();
+                let second = self.eval_expr(&args[1])?.to_display_string();
+                let rubric = self.eval_expr(&args[2])?.to_display_string();
+                // An omitted or null fourth argument means no model override.
+                let model = match args.get(3).map(|expr| self.eval_expr(expr)).transpose()? {
+                    None | Some(Value::Null) => None,
+                    Some(other) => Some(other.to_display_string()),
+                };
+                crate::llm_builtins::llm_compare(&first, &second, &rubric, model.as_deref())
+            }
             // llm_models() -> dict[]
             //   Returns the list of models available from the active provider.
             //   Each element is a dict with at least {"id": string, "provider": string}.
@@ -14555,8 +14606,10 @@ pub(crate) const HEAL_BUILTIN_NAMES: &[&str] = &[
     "re_match", "re_find", "re_find_all", "re_replace", "re_split",
     "json_parse", "json_stringify", "json_extract", "str_format",
     "sha256", "sha512", "base64_encode", "base64_decode",
-    // LLM builtins (Anthropic API — enabled with llm-builtins feature)
-    "llm_call", "llm_chat", "llm_embed",
+    // LLM builtins
+    "llm_call", "llm_chat", "llm_embed", "llm_models", "llm_system",
+    "llm_stream_print", "llm_judge", "llm_compare",
+    "llm_tools", "substrate_embed",
     "batch_llm_call", "batch_llm_chat",
     // Native HTTP builtins
     "http_get", "http_post", "http_post_json", "http_put", "http_delete",
@@ -14633,7 +14686,7 @@ pub(crate) const HEAL_BUILTIN_NAMES: &[&str] = &[
     "is_singularity", "ensure_clean", "collapse", "invert",
     "quantize", "quantization_ratio",
     // I/O
-    "read_file", "write_file", "file_exists", "print",
+    "read_file", "write_file", "file_exists", "file_ls", "print",
     "println", "print_raw",
     // Time / random / conversion / introspection
     "now_ms", "random_int", "random_float", "random_seed",
diff --git a/omnimcode-core/src/llm_builtins.rs b/omnimcode-core/src/llm_builtins.rs
@@ -205,6 +205,96 @@ pub fn llm_stream_print(
     Err("llm_stream_print: recompile with --features native-llm".to_string())
 }
 
+/// `llm_judge(responses, criteria, model?) -> dict[]`
+///
+/// Asks the model to score each element of `responses` (1-10) against `criteria` and returns the
+/// first JSON array found in its reply (requested as `[{idx, score, reason}, ...]`), unsorted.
+/// Errors on a non-array `responses` or a failed call; a non-string or unparseable reply yields `[]`.
+#[cfg(feature = "native-llm")]
+pub fn llm_judge(
+    responses: &Value,
+    criteria: &str,
+    model_override: Option<&str>,
+) -> Result<Value, String> {
+    let items = match responses {
+        Value::Array(a) => a.items.borrow().clone(),
+        _ => return Err("llm_judge: first arg must be an array of strings".to_string()),
+    };
+
+    let mut prompt = format!(
+        "Score each response below (1-10) based on: {criteria}\n\
+         Return ONLY JSON: [{{\"idx\":0,\"score\":8,\"reason\":\"...\"}}, ...]\n\n"
+    );
+    for (i, item) in items.iter().enumerate() {
+        prompt.push_str(&format!("[{}]: {}\n---\n", i, item.to_display_string()));
+    }
+
+    let sys = "You are a precise evaluator. Output only valid JSON with no extra text.";
+    let raw = llm_call_sys(&prompt, model_override, Some(sys))?;
+    let text = match raw { Value::String(s) => s, _ => return Ok(Value::Array(HArray::from_vec(vec![]))) };
+    // Try every '[' as a candidate start. The streaming deserializer stops after one complete
+    // value, so trailing prose is ignored and the reply is never sliced inside a UTF-8 character.
+    for (start, _) in text.match_indices('[') {
+        let tail = &text[start..];
+        let mut values = serde_json::Deserializer::from_str(tail).into_iter::<serde_json::Value>();
+        if let Some(Ok(v)) = values.next() {
+            return Ok(json_to_value(&v));
+        }
+    }
+    Ok(Value::Array(HArray::from_vec(vec![])))
+}
+
+/// Fallback for builds without the `native-llm` feature: always returns an error.
+#[cfg(not(feature = "native-llm"))]
+pub fn llm_judge(
+    _responses: &Value,
+    _criteria: &str,
+    _model_override: Option<&str>,
+) -> Result<Value, String> {
+    Err("llm_judge: recompile with --features native-llm".to_string())
+}
+
+/// `llm_compare(a, b, criteria, model?) -> dict`
+///
+/// Asks the model which of `a` and `b` better meets `criteria`; returns the first JSON object in its
+/// reply (requested as `{winner: "A"|"B", reason}`), or `Null` if none. Errors if the call fails.
+#[cfg(feature = "native-llm")]
+pub fn llm_compare(
+    a: &str,
+    b: &str,
+    criteria: &str,
+    model_override: Option<&str>,
+) -> Result<Value, String> {
+    let prompt = format!(
+        "Compare these two responses based on: {criteria}\n\n\
+         [A]: {a}\n\n[B]: {b}\n\n\
+         Return ONLY JSON: {{\"winner\":\"A\",\"reason\":\"...\"}}"
+    );
+    let sys = "You are an impartial judge. Output only valid JSON.";
+    let raw = llm_call_sys(&prompt, model_override, Some(sys))?;
+    let text = match raw { Value::String(s) => s, _ => return Ok(Value::Null) };
+    // Stream-parse from each '{': stops after one value and never slices inside a UTF-8 character.
+    for (start, _) in text.match_indices('{') {
+        let tail = &text[start..];
+        let mut values = serde_json::Deserializer::from_str(tail).into_iter::<serde_json::Value>();
+        if let Some(Ok(v)) = values.next() {
+            return Ok(json_to_value(&v));
+        }
+    }
+    Ok(Value::Null)
+}
+
+/// Fallback for builds without the `native-llm` feature: always returns an error.
+#[cfg(not(feature = "native-llm"))]
+pub fn llm_compare(
+    _a: &str,
+    _b: &str,
+    _criteria: &str,
+    _model_override: Option<&str>,
+) -> Result<Value, String> {
+    Err("llm_compare: recompile with --features native-llm".to_string())
+}
+
 /// `batch_llm_call(prompts, model?, concurrency?) -> string[]`
 ///
 /// Send multiple prompts to the LLM sequentially and return all responses in