Skip to content

Commit 26b94bb

Browse files
committed
feat: add skill-discovery extension
BM25-based skill search with dynamic injection and compression recovery, inspired by modern agent skill discovery patterns.

Core capabilities:
- Turn 0 discovery: BM25 keyword search on user input
- Inter-turn discovery: re-search after write/edit operations
- Dynamic injection: matched skills injected as <system-reminder>
- Compression recovery: SkillBackup restores skill content after compaction
- Dual-source indexing: user skills (pi.getCommands) + built-in skills (System Prompt)
- Hot-reload: chokidar watches SKILL.md changes with 300ms debounce
- Logging: persistent log at ~/.openclaw/logs/skill-discovery.log
- Config: auto-generated ~/.openclaw/skill-discovery.json with enabled switch

Made-with: Cursor
1 parent 57ded59 commit 26b94bb

8 files changed

Lines changed: 1268 additions & 0 deletions

File tree

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
import type { SearchEngine, SearchResult } from "../types.js";
2+
3+
/**
 * Settings for the embedding-based search engine. Every field is optional.
 */
export interface EmbeddingConfig {
  /** Embedding backend to use. */
  provider?: "google" | "openai" | "custom";
  /** Credential for the selected provider. */
  apiKey?: string;
  /** Endpoint override — presumably for "custom" or self-hosted backends; TODO confirm once implemented. */
  baseUrl?: string;
  /** Identifier of the embedding model to request. */
  model?: string;
}
9+
10+
/**
 * Embedding vector search engine.
 *
 * Enabled when config.embedding has both provider and apiKey.
 * Currently a stub — search() returns [] but `available` reflects config.
 *
 * TODO: Implement actual embedding logic:
 * - Pre-compute skill embeddings at init
 * - On search: embed query, cosine similarity against cached vectors
 * - Incremental update on chokidar change
 */
export class EmbeddingSearch implements SearchEngine {
  /** Engine identifier. */
  readonly name = "embedding";
  /** True only when both `apiKey` and `provider` were supplied (and are non-empty). */
  readonly available: boolean;
  /** Configuration captured at construction; never reassigned afterwards. */
  private readonly config: EmbeddingConfig;

  /**
   * @param cfg Optional embedding settings; a missing value is treated as `{}`.
   */
  constructor(cfg?: EmbeddingConfig) {
    this.config = cfg ?? {};
    this.available = Boolean(this.config.apiKey && this.config.provider);
  }

  /** No-op placeholder; resolves immediately. */
  async init(): Promise<void> {
    // TODO: pre-compute skill embedding vectors using this.config
  }

  /**
   * Stub search: ignores the query and always resolves to an empty list.
   *
   * @param _query Search text (currently unused).
   * @returns An empty result array.
   */
  async search(_query: string): Promise<SearchResult[]> {
    // TODO: embed query → cosine similarity → return ranked results
    return [];
  }
}
Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
import type { SearchEngine, SearchResult, SkillMeta } from "../types.js";
2+
3+
/**
 * Splits free text into lowercase search tokens.
 *
 * Processing steps:
 * 1. Lowercase the text and replace every run of characters that are neither
 *    `\w` nor in U+4E00–U+9FFF with a single space.
 * 2. For each whitespace-delimited chunk, emit CJK tokens first, then ASCII tokens:
 *    - a CJK run of length ≤ 2 is kept whole; a longer run is expanded into
 *      overlapping bigrams for better matching;
 *    - an ASCII run (`[a-z0-9_]+`) is kept only when longer than one character.
 *
 * @param text Raw input text.
 * @returns Tokens in emission order; duplicates are preserved.
 */
export function tokenize(text: string): string[] {
  // Loop-invariant patterns: built once per call instead of once per chunk.
  // String.prototype.match resets lastIndex for global regexes, so reuse is safe.
  const cjkPattern = /[\u4e00-\u9fff]+/g;
  const asciiPattern = /[a-z0-9_]+/g;

  const normalized = text.toLowerCase().replace(/[^\w\u4e00-\u9fff]+/g, " ");
  const tokens: string[] = [];

  for (const chunk of normalized.split(/\s+/)) {
    if (!chunk) continue;

    // Split CJK runs into bigrams for better matching
    for (const run of chunk.match(cjkPattern) ?? []) {
      if (run.length <= 2) {
        tokens.push(run);
        continue;
      }
      for (let i = 0; i < run.length - 1; i++) {
        tokens.push(run.slice(i, i + 2));
      }
    }

    // Single-character ASCII runs are dropped.
    for (const run of chunk.match(asciiPattern) ?? []) {
      if (run.length > 1) tokens.push(run);
    }
  }

  return tokens;
}
35+
36+
/**
 * BM25 keyword search over skills, backed by an in-memory inverted index
 * (token → skills whose `tokens` contain it).
 *
 * Scores are accumulated per skill *name*, so names are assumed to be unique
 * within the index — NOTE(review): confirm callers remove a skill before
 * re-adding it under the same name.
 */
export class InvertedIndex implements SearchEngine {
  /** Engine identifier; also used as the `source` tag on results. */
  readonly name = "bm25";
  /** This engine needs no external configuration, so it is always usable. */
  readonly available = true;

  /** token → skills containing that token at least once. */
  private index = new Map<string, SkillMeta[]>();
  /** Every indexed skill, in insertion order. */
  private allSkills: SkillMeta[] = [];
  /** Mean of `tokens.length` across all indexed skills (0 when empty). */
  private avgDocLen = 0;

  /**
   * Replaces the whole index with the given skills.
   *
   * @param skills Skills to index. The array is copied, so later
   *   `addSkill()` calls never mutate the caller's array.
   */
  build(skills: SkillMeta[]): void {
    this.allSkills = [...skills];
    this.index.clear();
    for (const skill of this.allSkills) {
      this.indexSkill(skill);
    }
    this.recalcAvgDocLen();
  }

  /**
   * Appends one skill to the index and refreshes the average document length.
   *
   * @param skill Skill to add.
   */
  addSkill(skill: SkillMeta): void {
    this.allSkills.push(skill);
    this.indexSkill(skill);
    this.recalcAvgDocLen();
  }

  /**
   * Removes every skill with the given name and drops posting lists that
   * become empty.
   *
   * @param name Name of the skill to remove; unknown names are a no-op.
   */
  removeSkill(name: string): void {
    this.allSkills = this.allSkills.filter((s) => s.name !== name);
    for (const [token, postings] of this.index) {
      const remaining = postings.filter((s) => s.name !== name);
      if (remaining.length === 0) {
        this.index.delete(token);
      } else {
        this.index.set(token, remaining);
      }
    }
    this.recalcAvgDocLen();
  }

  /** Adds `skill` to the posting list of each distinct token it contains. */
  private indexSkill(skill: SkillMeta): void {
    for (const token of new Set(skill.tokens)) {
      const postings = this.index.get(token);
      if (postings) {
        postings.push(skill);
      } else {
        this.index.set(token, [skill]);
      }
    }
  }

  /** Recomputes `avgDocLen` from the current skill list. */
  private recalcAvgDocLen(): void {
    const total = this.allSkills.reduce((sum, sk) => sum + sk.tokens.length, 0);
    this.avgDocLen = this.allSkills.length > 0 ? total / this.allSkills.length : 0;
  }

  /**
   * Async wrapper around {@link searchSync} using the default result limit.
   *
   * @param query Free-text query.
   * @returns Up to three results, best first.
   */
  async search(query: string): Promise<SearchResult[]> {
    return this.searchSync(query);
  }

  /**
   * Ranks indexed skills against the query with BM25 (k1 = 1.2, b = 0.75).
   *
   * @param query Free-text query; tokenized with `tokenize`.
   * @param limit Maximum number of results to return (default 3). Values
   *   below zero are treated as zero.
   * @returns Matching skills sorted by descending score. Empty when the query
   *   yields no tokens, the index is empty, or nothing matches.
   */
  searchSync(query: string, limit = 3): SearchResult[] {
    const queryTokens = tokenize(query);
    if (queryTokens.length === 0) return [];

    const N = this.allSkills.length;
    if (N === 0) return [];

    const k1 = 1.2;
    const b = 0.75;
    // Guard the length normalisation against a zero average (all-empty documents).
    const avgLen = this.avgDocLen > 0 ? this.avgDocLen : 1;
    const scores = new Map<string, number>();

    for (const token of queryTokens) {
      const postings = this.index.get(token);
      if (!postings) continue;

      const df = postings.length;
      // The "+ 1" inside the log keeps IDF positive even when df > N / 2.
      const idf = Math.log((N - df + 0.5) / (df + 0.5) + 1);

      for (const skill of postings) {
        let tf = 0;
        for (const t of skill.tokens) {
          if (t === token) tf++;
        }
        const dl = skill.tokens.length;
        const tfNorm = (tf * (k1 + 1)) / (tf + k1 * (1 - b + (b * dl) / avgLen));
        scores.set(skill.name, (scores.get(skill.name) ?? 0) + idf * tfNorm);
      }
    }

    const ranked = [...scores.entries()]
      .sort((left, right) => right[1] - left[1])
      .slice(0, Math.max(0, limit));

    const results: SearchResult[] = [];
    for (const [name, score] of ranked) {
      const skill = this.allSkills.find((s) => s.name === name);
      // Skip names that can no longer be resolved instead of asserting non-null.
      if (skill) results.push({ skill, score, source: "bm25" });
    }
    return results;
  }

  /** @returns Number of skills currently indexed. */
  getSkillCount(): number {
    return this.allSkills.length;
  }

  /**
   * @param name Skill name to look up.
   * @returns The first indexed skill with that name, or `undefined`.
   */
  getSkill(name: string): SkillMeta | undefined {
    return this.allSkills.find((s) => s.name === name);
  }
}
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
import type { SearchEngine, SearchResult } from "../types.js";
2+
3+
/**
 * Settings for the small-model judge search engine. Every field is optional.
 */
export interface ModelJudgeConfig {
  /** Name of the model provider to call. */
  provider?: string;
  /** Credential for the selected provider. */
  apiKey?: string;
  /** Endpoint override — presumably for self-hosted or proxy backends; TODO confirm once implemented. */
  baseUrl?: string;
  /** Identifier of the judge model to request. */
  model?: string;
}
9+
10+
/**
 * Small model judge search engine.
 *
 * Enabled when config.modelJudge has both provider and apiKey.
 * Currently a stub — search() returns [] but `available` reflects config.
 *
 * TODO: Implement actual judge logic:
 * - Call chat completion with a judge prompt listing all skill names + descriptions
 * - Parse model output for top-N matching skill names
 * - 3s timeout via Promise.race
 *
 * Recommended models (cheapest first):
 *   gemini-2.0-flash-lite, gpt-4.1-nano, groq/llama-3.1-8b-instant
 */
export class ModelJudge implements SearchEngine {
  /** Engine identifier. */
  readonly name = "model-judge";
  /** True only when both `apiKey` and `provider` were supplied (and are non-empty). */
  readonly available: boolean;
  /** Configuration captured at construction; never reassigned afterwards. */
  private readonly config: ModelJudgeConfig;

  /**
   * @param cfg Optional judge settings; a missing value is treated as `{}`.
   */
  constructor(cfg?: ModelJudgeConfig) {
    this.config = cfg ?? {};
    this.available = Boolean(this.config.apiKey && this.config.provider);
  }

  /** No-op placeholder; resolves immediately. */
  async init(): Promise<void> {
    // TODO: validate config, warm up connection
  }

  /**
   * Stub search: ignores the query and always resolves to an empty list.
   *
   * @param _query Search text (currently unused).
   * @returns An empty result array.
   */
  async search(_query: string): Promise<SearchResult[]> {
    // TODO: call small model with judge prompt → parse skill names → return results
    return [];
  }
}

0 commit comments

Comments
 (0)