OpenBMB
diff --git a/‎.pi/extensions/skill-discovery/engines/embedding-search.ts‎
Lines changed: 39 additions & 0 deletions b/‎.pi/extensions/skill-discovery/engines/embedding-search.ts‎
Lines changed: 39 additions & 0 deletions
diff --git a/‎.pi/extensions/skill-discovery/engines/inverted-index.ts‎
Lines changed: 145 additions & 0 deletions b/‎.pi/extensions/skill-discovery/engines/inverted-index.ts‎
Lines changed: 145 additions & 0 deletions
diff --git a/‎.pi/extensions/skill-discovery/engines/model-judge.ts‎
Lines changed: 42 additions & 0 deletions b/‎.pi/extensions/skill-discovery/engines/model-judge.ts‎
Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,39 @@
+import type { SearchEngine, SearchResult } from "../types.js";
+
+/**
+ * Options accepted by the EmbeddingSearch constructor.
+ *
+ * Every field is optional. EmbeddingSearch reports itself as available only
+ * when both `provider` and `apiKey` are truthy.
+ */
+export interface EmbeddingConfig {
+  /** Embedding backend; must be set together with `apiKey` to enable the engine. */
+  provider?: "google" | "openai" | "custom";
+  /** Provider credential; must be set together with `provider` to enable the engine. */
+  apiKey?: string;
+  /** Not read by the engine yet. NOTE(review): presumably an endpoint override — confirm once search is implemented. */
+  baseUrl?: string;
+  /** Not read by the engine yet. NOTE(review): presumably the embedding model id — confirm once search is implemented. */
+  model?: string;
+}
+
+/**
+ * Search engine slot reserved for embedding-vector similarity.
+ *
+ * `available` is true only when the supplied config carries both a provider
+ * and an apiKey. The engine itself is still a placeholder: init() does no
+ * work and search() always resolves to an empty list.
+ *
+ * TODO: Implement actual embedding logic:
+ * - Pre-compute skill embeddings at init
+ * - On search: embed query, cosine similarity against cached vectors
+ * - Incremental update on chokidar change
+ */
+export class EmbeddingSearch implements SearchEngine {
+  readonly name = "embedding";
+  readonly available: boolean;
+  private config: EmbeddingConfig;
+
+  /**
+   * @param cfg Optional engine settings; a missing value is treated as `{}`.
+   */
+  constructor(cfg?: EmbeddingConfig) {
+    const settings: EmbeddingConfig = cfg ?? {};
+    this.config = settings;
+    // Both credentials must be truthy; an empty-string apiKey counts as missing.
+    this.available = settings.apiKey ? Boolean(settings.provider) : false;
+  }
+
+  /** Initialisation hook; currently performs no work. */
+  async init(): Promise<void> {
+    // TODO: pre-compute skill embedding vectors using this.config
+  }
+
+  /**
+   * Placeholder search.
+   *
+   * @param _query Ignored until the embedding lookup is implemented.
+   * @returns Always an empty result list.
+   */
+  async search(_query: string): Promise<SearchResult[]> {
+    // TODO: embed query → cosine similarity → return ranked results
+    const noResults: SearchResult[] = [];
+    return noResults;
+  }
+}
@@ -0,0 +1,145 @@
+import type { SearchEngine, SearchResult, SkillMeta } from "../types.js";
+
+/**
+ * Break free text into lowercase search tokens.
+ *
+ * The text is lowercased, every run of characters other than `\w` and
+ * U+4E00–U+9FFF is turned into a space, and the result is split on
+ * whitespace. For each resulting piece:
+ * - runs of U+4E00–U+9FFF characters of length 1 or 2 are emitted whole;
+ *   longer runs are emitted as overlapping two-character windows;
+ * - runs of `[a-z0-9_]` are emitted when they are at least 2 characters long.
+ *
+ * Within one piece the CJK tokens come before the ASCII tokens.
+ *
+ * @param text Raw input text.
+ * @returns Tokens in emission order; duplicates are kept.
+ */
+export function tokenize(text: string): string[] {
+  const normalized = text.toLowerCase().replace(/[^\w\u4e00-\u9fff]+/g, " ");
+  const out: string[] = [];
+
+  for (const piece of normalized.split(/\s+/)) {
+    if (piece.length === 0) continue;
+
+    // CJK first: short runs as-is, longer runs as sliding bigrams
+    for (const han of piece.match(/[\u4e00-\u9fff]+/g) ?? []) {
+      if (han.length <= 2) {
+        out.push(han);
+        continue;
+      }
+      for (let start = 0; start + 2 <= han.length; start++) {
+        out.push(han.slice(start, start + 2));
+      }
+    }
+
+    // Then ASCII words; single characters are dropped
+    for (const word of piece.match(/[a-z0-9_]+/g) ?? []) {
+      if (word.length >= 2) out.push(word);
+    }
+  }
+
+  return out;
+}
+
+/**
+ * In-memory BM25 keyword search over skills.
+ *
+ * Each skill supplies a pre-computed `tokens` list. The index maps every
+ * distinct token to the skills containing it, and keeps the average token
+ * count per skill for BM25 length normalisation. Skill names are treated as
+ * unique keys: scores are accumulated per name and removal is by name.
+ */
+export class InvertedIndex implements SearchEngine {
+  readonly name = "bm25";
+  readonly available = true;
+
+  /** token -> skills whose token list contains it (one entry per skill). */
+  private index = new Map<string, SkillMeta[]>();
+  private allSkills: SkillMeta[] = [];
+  private avgDocLen = 0;
+
+  /**
+   * Replace the whole index with the given skills.
+   *
+   * @param skills Skills to index. The array is copied, so later
+   *   addSkill()/removeSkill() calls never modify the caller's array.
+   */
+  build(skills: SkillMeta[]): void {
+    this.allSkills = [...skills];
+    this.index.clear();
+    for (const skill of this.allSkills) {
+      this.indexSkill(skill);
+    }
+    this.recalcAvgDocLen();
+  }
+
+  /**
+   * Add one skill to the index.
+   *
+   * If a skill with the same name is already indexed it is replaced, so the
+   * document count and per-token document frequencies are not inflated by
+   * duplicates.
+   */
+  addSkill(skill: SkillMeta): void {
+    if (this.allSkills.some((s) => s.name === skill.name)) {
+      this.removeSkill(skill.name);
+    }
+    this.allSkills.push(skill);
+    this.indexSkill(skill);
+    this.recalcAvgDocLen();
+  }
+
+  /**
+   * Remove every indexed skill with the given name. Unknown names are a no-op.
+   */
+  removeSkill(name: string): void {
+    this.allSkills = this.allSkills.filter((s) => s.name !== name);
+    for (const [token, postings] of this.index) {
+      const kept = postings.filter((s) => s.name !== name);
+      if (kept.length === 0) {
+        this.index.delete(token);
+      } else if (kept.length !== postings.length) {
+        this.index.set(token, kept);
+      }
+    }
+    this.recalcAvgDocLen();
+  }
+
+  /** Register a skill under each of its distinct tokens. */
+  private indexSkill(skill: SkillMeta): void {
+    for (const token of new Set(skill.tokens)) {
+      const postings = this.index.get(token);
+      if (postings) {
+        postings.push(skill);
+      } else {
+        this.index.set(token, [skill]);
+      }
+    }
+  }
+
+  /** Recompute the mean token count per skill (0 when the index is empty). */
+  private recalcAvgDocLen(): void {
+    const total = this.allSkills.reduce((sum, sk) => sum + sk.tokens.length, 0);
+    this.avgDocLen = this.allSkills.length > 0 ? total / this.allSkills.length : 0;
+  }
+
+  /** Promise-returning wrapper around searchSync(). */
+  async search(query: string): Promise<SearchResult[]> {
+    return this.searchSync(query);
+  }
+
+  /**
+   * Rank skills against a query with BM25 (k1 = 1.2, b = 0.75).
+   *
+   * The query is split with tokenize(); each query token occurrence adds
+   * `idf * tfNorm` to every skill containing that token.
+   *
+   * @param query Free-text query.
+   * @returns At most the three highest-scoring skills, best first, each
+   *   tagged with source "bm25". Empty when the query yields no tokens or
+   *   nothing is indexed.
+   */
+  searchSync(query: string): SearchResult[] {
+    const queryTokens = tokenize(query);
+    const docCount = this.allSkills.length;
+    if (queryTokens.length === 0 || docCount === 0) return [];
+
+    const k1 = 1.2;
+    const lengthWeight = 0.75; // BM25 "b"
+    const scores = new Map<string, number>();
+
+    for (const token of queryTokens) {
+      const postings = this.index.get(token);
+      if (!postings) continue;
+
+      const df = postings.length;
+      const idf = Math.log((docCount - df + 0.5) / (df + 0.5) + 1);
+
+      for (const skill of postings) {
+        let tf = 0;
+        for (const t of skill.tokens) {
+          if (t === token) tf++;
+        }
+        const dl = skill.tokens.length;
+        // avgDocLen is > 0 here: a posting exists, so some skill has tokens
+        const lengthNorm = 1 - lengthWeight + (lengthWeight * dl) / this.avgDocLen;
+        const tfNorm = (tf * (k1 + 1)) / (tf + k1 * lengthNorm);
+        scores.set(skill.name, (scores.get(skill.name) ?? 0) + idf * tfNorm);
+      }
+    }
+
+    const ranked = [...scores.entries()]
+      .sort((left, right) => right[1] - left[1])
+      .slice(0, 3);
+
+    const results: SearchResult[] = [];
+    for (const [name, score] of ranked) {
+      const skill = this.getSkill(name);
+      if (skill) {
+        results.push({ skill, score, source: "bm25" as const });
+      }
+    }
+    return results;
+  }
+
+  /** Number of skills currently indexed. */
+  getSkillCount(): number {
+    return this.allSkills.length;
+  }
+
+  /** Look up an indexed skill by name; undefined when it is not indexed. */
+  getSkill(name: string): SkillMeta | undefined {
+    return this.allSkills.find((s) => s.name === name);
+  }
+}
@@ -0,0 +1,42 @@
+import type { SearchEngine, SearchResult } from "../types.js";
+
+/**
+ * Options accepted by the ModelJudge constructor.
+ *
+ * Every field is optional. ModelJudge reports itself as available only when
+ * both `provider` and `apiKey` are truthy.
+ */
+export interface ModelJudgeConfig {
+  /** Model provider identifier; must be set together with `apiKey` to enable the engine. */
+  provider?: string;
+  /** Provider credential; must be set together with `provider` to enable the engine. */
+  apiKey?: string;
+  /** Not read by the engine yet. NOTE(review): presumably an endpoint override — confirm once search is implemented. */
+  baseUrl?: string;
+  /** Not read by the engine yet. NOTE(review): presumably the judge model id — confirm once search is implemented. */
+  model?: string;
+}
+
+/**
+ * Search engine slot reserved for a small-model relevance judge.
+ *
+ * `available` is true only when the supplied config carries both a provider
+ * and an apiKey. The engine itself is still a placeholder: init() does no
+ * work and search() always resolves to an empty list.
+ *
+ * TODO: Implement actual judge logic:
+ * - Call chat completion with a judge prompt listing all skill names + descriptions
+ * - Parse model output for top-N matching skill names
+ * - 3s timeout via Promise.race
+ *
+ * Recommended models (cheapest first):
+ *   gemini-2.0-flash-lite, gpt-4.1-nano, groq/llama-3.1-8b-instant
+ */
+export class ModelJudge implements SearchEngine {
+  readonly name = "model-judge";
+  readonly available: boolean;
+  private config: ModelJudgeConfig;
+
+  /**
+   * @param cfg Optional engine settings; a missing value is treated as `{}`.
+   */
+  constructor(cfg?: ModelJudgeConfig) {
+    const settings: ModelJudgeConfig = cfg ?? {};
+    this.config = settings;
+    // Both credentials must be truthy; an empty-string apiKey counts as missing.
+    this.available = settings.apiKey ? Boolean(settings.provider) : false;
+  }
+
+  /** Initialisation hook; currently performs no work. */
+  async init(): Promise<void> {
+    // TODO: validate config, warm up connection
+  }
+
+  /**
+   * Placeholder search.
+   *
+   * @param _query Ignored until the judge call is implemented.
+   * @returns Always an empty result list.
+   */
+  async search(_query: string): Promise<SearchResult[]> {
+    // TODO: call small model with judge prompt → parse skill names → return results
+    const noResults: SearchResult[] = [];
+    return noResults;
+  }
+}