Skip to content

Commit e497411

Browse files
committed
feat: add ask-self and ask-self-orig scaffolding
1 parent 4841dd8 commit e497411

8 files changed

Lines changed: 1589 additions & 0 deletions

File tree

ask-self-orig/helpers.js

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
// Pure-function helpers for the RAG module. No I/O, no global state — safe to
2+
// import from either the CJS query module (src/rag/index.js) or the ESM ingest
3+
// script (src/rag/ingest.mjs). Every function here is designed to be unit-tested
4+
// without mocks, network access, or a filesystem.
5+
6+
const CHUNK_TARGET_CHARS = 4800; // ~1200 tokens
7+
const CHUNK_OVERLAP_CHARS = 600; // ~150 tokens
8+
9+
// Priority boosts applied during retrieval re-rank. Higher priority wins at
// near-equal distance. See the spike learning in PROJECT/2-WORKING/P1-CODE-RAG.md
// for why strategy/changelog beat regular docs.
//
// Frozen because this object is exported and shared by every importer: an
// accidental write in one module must not silently change ranking elsewhere.
const PRIORITY = Object.freeze({
  feature_map: 10,
  strategy: 5,
  changelog_entry: 5,
  doc: 1,
  pr: 1,
});
19+
20+
/**
 * Split text into overlapping chunks sized for embedding.
 *
 * Returns `[]` for an empty string and `[text]` when the text already fits in
 * a single chunk. Otherwise each chunk holds at most `targetChars` UTF-16 code
 * units and begins `overlap` code units before the previous chunk ended.
 *
 * Boundaries are nudged by one code unit where necessary so that a surrogate
 * pair (emoji, astral-plane characters) is never cut in half. As a result a
 * chunk can be one unit shorter than `targetChars`, or one unit longer when a
 * lone pair would not otherwise fit, and an overlap can be one unit shorter.
 * Text without surrogate pairs is chunked exactly at `targetChars`/`overlap`.
 *
 * @param {string} text
 * @param {{targetChars?: number, overlap?: number}} [options]
 * @returns {string[]}
 * @throws {TypeError} If `text` is not a string.
 * @throws {RangeError} If `targetChars` is not > 0 or `overlap` is not in
 *   `[0, targetChars)`. NaN is rejected for both.
 */
function chunkText(text, { targetChars = CHUNK_TARGET_CHARS, overlap = CHUNK_OVERLAP_CHARS } = {}) {
  if (typeof text !== 'string') throw new TypeError('chunkText: text must be a string');
  // Written as negated "must hold" comparisons so NaN fails validation. With
  // the plain `<=`/`>=` form a NaN overlap slipped through and the loop exited
  // after the first chunk, silently dropping the rest of the text.
  if (!(targetChars > 0)) throw new RangeError('chunkText: targetChars must be > 0');
  if (!(overlap >= 0 && overlap < targetChars)) {
    throw new RangeError('chunkText: overlap must be >= 0 and < targetChars');
  }
  if (text.length === 0) return [];
  if (text.length <= targetChars) return [text];

  const isHighSurrogate = (idx) => {
    const unit = text.charCodeAt(idx);
    return unit >= 0xd800 && unit <= 0xdbff;
  };
  const isLowSurrogate = (idx) => {
    const unit = text.charCodeAt(idx);
    return unit >= 0xdc00 && unit <= 0xdfff;
  };
  // True when cutting the string at `idx` would separate a surrogate pair.
  const splitsPair = (idx) =>
    idx > 0 && idx < text.length && isHighSurrogate(idx - 1) && isLowSurrogate(idx);

  const chunks = [];
  let start = 0;
  while (start < text.length) {
    let end = Math.min(start + targetChars, text.length);
    if (splitsPair(end)) {
      // Prefer a shorter chunk; grow instead only when shrinking would leave
      // the chunk empty (the pair is the very first thing in it).
      end = end - 1 > start ? end - 1 : end + 1;
    }
    chunks.push(text.slice(start, end));
    if (end >= text.length) break;
    // `start + 1` guarantees forward progress even after `end` was pulled back.
    let next = Math.max(end - overlap, start + 1);
    // Never begin a chunk on the trailing half of a pair. Moving forward is
    // safe: the previous chunk already covers everything before `end`.
    if (splitsPair(next)) next += 1;
    start = next;
  }
  return chunks;
}
44+
45+
/**
46+
* Split a CHANGELOG.md into one chunk per version entry. Version headers are
47+
* expected to match `## 1.2.3` at the start of a line. The returned version
48+
* string is the `1.2.3` capture from the header, or null if the chunk didn't
49+
* start with a version header (e.g., a preamble paragraph).
50+
*
51+
* @param {string} text
52+
* @returns {Array<{version: string|null, content: string}>}
53+
*/
54+
function chunkChangelog(text) {
55+
if (typeof text !== 'string') throw new TypeError('chunkChangelog: text must be a string');
56+
const parts = text.split(/(?=^##\s+\d+\.\d+\.\d+)/m).map((s) => s.trim()).filter(Boolean);
57+
return parts.map((entry) => {
58+
const versionMatch = entry.match(/^##\s+(\d+\.\d+\.\d+)/);
59+
return { version: versionMatch ? versionMatch[1] : null, content: entry };
60+
});
61+
}
62+
63+
/**
 * Map a repo-relative doc path to a retrieval source label and priority.
 *
 * Rules are tried in order and the first match wins: a path ending in
 * `changelog.md` (case-insensitive) is a changelog; a path containing any of
 * the strategy keywords is a strategy doc; everything else is a plain doc.
 * Priorities come from the module-level PRIORITY table.
 *
 * @param {string} relPath - repo-relative path (forward slashes)
 * @returns {{source: 'changelog'|'strategy'|'doc', priority: number}}
 * @throws {TypeError} If `relPath` is not a non-empty string.
 */
function classifyDoc(relPath) {
  const isUsablePath = typeof relPath === 'string' && relPath.length > 0;
  if (!isUsablePath) {
    throw new TypeError('classifyDoc: relPath must be a non-empty string');
  }

  // [pattern, source label, key into PRIORITY] — order matters.
  const rules = [
    [/changelog\.md$/i, 'changelog', 'changelog_entry'],
    [/strategy|product.*brief|moat|pmf|positioning/i, 'strategy', 'strategy'],
  ];
  const winner = rules.find(([pattern]) => pattern.test(relPath));
  const [, source, priorityKey] = winner ?? [null, 'doc', 'doc'];
  return { source, priority: PRIORITY[priorityKey] };
}
83+
84+
/**
 * Format retrieved chunks into a single context string for the synthesis model,
 * capped at `maxContextChars` characters in total (headers, content and the
 * newline separators between blocks all count towards the cap).
 *
 * Each chunk gets a source-aware header so the model can cite it correctly.
 * Input order is preserved. Entries that are falsy or have a non-string
 * `content` are skipped. Formatting stops at the first block that would not
 * fit, so later (lower-ranked) hits never jump ahead of a dropped one.
 *
 * @param {Array<{source: string, path?: string, pr_number?: number|null, version?: string|null, content: string}>} hits
 * @param {number} [maxContextChars=80000]
 * @returns {string} Blocks joined by a newline; `''` when nothing fits.
 * @throws {TypeError} If `hits` is not an array.
 */
function formatContext(hits, maxContextChars = 80000) {
  if (!Array.isArray(hits)) throw new TypeError('formatContext: hits must be an array');
  const separator = '\n';
  const parts = [];
  let totalChars = 0;
  for (const h of hits) {
    if (!h || typeof h.content !== 'string') continue;
    let header;
    if (h.source === 'pr') {
      // pr_number is typed as nullable; don't render "PR #null"/"PR #undefined".
      header = h.pr_number != null ? `[PR #${h.pr_number}]` : '[PR]';
    } else if (h.source === 'changelog') {
      header = `[changelog.md${h.version ? ` — ${h.version}` : ''}]`;
    } else {
      header = `[${h.path ?? h.source}]`;
    }
    const block = `=== ${header} ===\n${h.content}\n`;
    // Every block after the first also costs one separator once joined;
    // ignoring it let the result overshoot the budget.
    const cost = block.length + (parts.length > 0 ? separator.length : 0);
    if (totalChars + cost > maxContextChars) break;
    parts.push(block);
    totalChars += cost;
  }
  return parts.join(separator);
}
111+
112+
module.exports = {
113+
CHUNK_TARGET_CHARS,
114+
CHUNK_OVERLAP_CHARS,
115+
PRIORITY,
116+
chunkText,
117+
chunkChangelog,
118+
classifyDoc,
119+
formatContext,
120+
};

ask-self-orig/index.js

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
// Sleuth Code RAG — query module.
2+
// Exports askSelf(query, teamId) used by chat-module for the `ask-self` command.
3+
// Tenancy gate is layer 2 (module-level) per PROJECT/2-WORKING/P1-CODE-RAG.md.
4+
5+
const path = require('node:path');
6+
const fs = require('node:fs');
7+
const { formatContext } = require('./helpers.js');
8+
9+
const MODULE_DIR = __dirname;
10+
const REPO_ROOT = path.join(MODULE_DIR, '..', '..');
11+
const DB_PATH = path.join(REPO_ROOT, 'data', 'rag', 'sleuth-rag.sqlite');
12+
const PROMPTS_PATH = path.join(MODULE_DIR, 'prompts.json');
13+
14+
const EMBED_MODEL = 'gemini-embedding-001';
15+
const EMBED_DIM = 768;
16+
const SYNTHESIS_MODEL = 'gemini-pro-latest'; // rolling alias — always newest Gemini Pro
17+
const TOP_K = 20; // retrieve generously, trust Gemini to sort
18+
const PRIORITY_BOOST = 0.02; // small nudge — doesn't override clear semantic wins
19+
const MAX_CONTEXT_CHARS = 80000; // ~20k tokens — spike showed 18k works well
20+
21+
// Raised when a request fails the workspace allowlist check. Callers can tell
// it apart from other failures via `instanceof` or the `name` property.
class TenancyError extends Error {
  name = 'TenancyError';

  /** @param {string} message - Human-readable reason for the rejection. */
  constructor(message) {
    super(message);
  }
}
27+
28+
// Lazy-loaded singletons so a missing env var at boot doesn't kill the process.
29+
// They throw on first askSelf() call instead, which chat-module catches silently.
30+
let _db = null;
31+
let _prompts = null;
32+
33+
/**
 * Return the shared read-only SQLite handle for the RAG index, opening it and
 * loading the sqlite-vec extension on first use.
 *
 * The handle is cached only after the extension has loaded. If loading fails
 * the handle is closed and the error rethrown, so the next call retries from
 * scratch instead of handing out a connection without the extension.
 *
 * @returns {object} The cached better-sqlite3 database handle.
 * @throws {Error} If the index file is missing, or if opening the database or
 *   loading the extension fails.
 */
function getDb() {
  if (_db) return _db;
  if (!fs.existsSync(DB_PATH)) {
    throw new Error(`RAG index missing at ${DB_PATH}. Run: npm run rag:ingest`);
  }
  // Lazy-require native modules so a broken install doesn't poison Sleuth startup
  // for workspaces that never touch ask-self.
  const Database = require('better-sqlite3');
  const sqliteVec = require('sqlite-vec');
  const db = new Database(DB_PATH, { readonly: true });
  try {
    sqliteVec.load(db);
  } catch (err) {
    // Don't leak the handle or cache a half-initialized connection.
    db.close();
    throw err;
  }
  _db = db;
  return _db;
}
46+
47+
/**
 * Return the parsed contents of prompts.json, reading and parsing the file on
 * the first call and serving the cached value afterwards.
 *
 * @returns {*} The parsed JSON value.
 * @throws {Error} If the file cannot be read or is not valid JSON; nothing is
 *   cached in that case, so a later call tries again.
 */
function getPrompts() {
  if (!_prompts) {
    const rawJson = fs.readFileSync(PROMPTS_PATH, 'utf8');
    _prompts = JSON.parse(rawJson);
  }
  return _prompts;
}
52+
53+
/**
 * Throw unless `teamId` is exactly the workspace configured in the
 * NEOCHROME_TEAM_ID environment variable. Returns nothing on success.
 *
 * Checks run in a fixed order — configuration first, then the argument, then
 * the comparison — and the first failure decides the error message.
 *
 * @param {string} teamId - Team ID of the requesting workspace.
 * @throws {TenancyError} If the env var is unset/empty, `teamId` is not a
 *   non-empty string, or the two differ.
 */
function assertTenancy(teamId) {
  const isNonEmptyString = (value) => typeof value === 'string' && value.length > 0;
  const configuredTeam = process.env.NEOCHROME_TEAM_ID;

  let failure = null;
  if (!isNonEmptyString(configuredTeam)) {
    failure = 'NEOCHROME_TEAM_ID not configured';
  } else if (!isNonEmptyString(teamId)) {
    failure = 'teamId argument required';
  } else if (teamId !== configuredTeam) {
    failure = 'teamId does not match allowlist';
  }

  if (failure !== null) throw new TenancyError(failure);
}
65+
66+
/**
 * Embed a query string with the Gemini embedding model and return the vector
 * as raw bytes (the Float32 values viewed as a Uint8Array), which is the form
 * this module passes to the vector search as a bound parameter.
 *
 * The API key is sent in the `x-goog-api-key` header rather than the URL so it
 * cannot end up in request URLs captured by logs, proxies or error reports.
 *
 * @param {string} query - Text to embed.
 * @returns {Promise<Uint8Array>} Byte view over EMBED_DIM float32 values.
 * @throws {Error} If GOOGLE_API_KEY is unset, the API responds with a non-2xx
 *   status, or the response does not contain exactly EMBED_DIM values.
 */
async function embedQuery(query) {
  const apiKey = process.env.GOOGLE_API_KEY;
  if (!apiKey) throw new Error('GOOGLE_API_KEY not set');
  const endpoint = `https://generativelanguage.googleapis.com/v1beta/models/${EMBED_MODEL}:embedContent`;
  const res = await fetch(endpoint, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'x-goog-api-key': apiKey,
    },
    body: JSON.stringify({
      model: `models/${EMBED_MODEL}`,
      content: { parts: [{ text: query }] },
      taskType: 'RETRIEVAL_QUERY',
      outputDimensionality: EMBED_DIM,
    }),
  });
  // Cap the echoed body so an HTML error page can't flood logs.
  if (!res.ok) throw new Error(`Gemini embed ${res.status}: ${(await res.text()).slice(0, 300)}`);
  const data = await res.json();
  const values = data?.embedding?.values;
  if (!Array.isArray(values) || values.length !== EMBED_DIM) {
    throw new Error(`Gemini embed: unexpected shape, got ${values?.length} dims`);
  }
  return new Uint8Array(new Float32Array(values).buffer);
}
88+
89+
/**
 * Find the `k` nearest chunks to `queryVec`, attach their metadata rows, and
 * return them ordered by a priority-adjusted score (lowest score first).
 *
 * score = distance - priority * PRIORITY_BOOST, with priority defaulting to 1
 * when the metadata row has none. Neighbours whose metadata row cannot be
 * found are left out, and a single console warning lists their rowids.
 *
 * @param {object} db - Handle exposing `prepare(sql).all(...params)`.
 * @param {*} queryVec - Query embedding, bound as the MATCH parameter.
 * @param {number} [k=TOP_K] - Maximum number of neighbours to fetch.
 * @returns {Array<object>} Metadata rows extended with `distance` and `score`.
 */
function knnSearch(db, queryVec, k = TOP_K) {
  const neighbours = db
    .prepare('SELECT rowid, distance FROM chunks_vec WHERE embedding MATCH ? ORDER BY distance LIMIT ?')
    .all(queryVec, k);
  if (neighbours.length === 0) return [];

  // One bound placeholder per rowid keeps the metadata lookup parameterized.
  const rowIds = neighbours.map(({ rowid }) => Number(rowid));
  const marks = rowIds.map(() => '?').join(',');
  const metaRows = db
    .prepare(`SELECT id, source, path, pr_number, version, priority, content FROM chunks WHERE id IN (${marks})`)
    .all(...rowIds);

  const metaById = new Map();
  for (const meta of metaRows) metaById.set(Number(meta.id), meta);

  // Split neighbours into scored results and rowids with no metadata row.
  const orphanIds = [];
  const scored = [];
  neighbours.forEach(({ rowid, distance }) => {
    const meta = metaById.get(Number(rowid));
    if (meta) {
      const score = distance - (meta.priority ?? 1) * PRIORITY_BOOST;
      scored.push({ ...meta, distance, score });
    } else {
      orphanIds.push(rowid);
    }
  });

  if (orphanIds.length > 0) {
    console.warn(`[rag] knnSearch: dropped ${orphanIds.length} hit(s) with missing metadata rows (rowids: ${orphanIds.join(', ')}). Rebuild the index with: npm run rag:ingest`);
  }

  scored.sort((left, right) => left.score - right.score);
  return scored;
}
120+
121+
/**
 * Ask the Gemini synthesis model to answer `query` using the retrieved
 * `context`, under the given system prompt.
 *
 * The API key is sent in the `x-goog-api-key` header rather than the URL so it
 * cannot end up in request URLs captured by logs, proxies or error reports.
 * The text of every part of the first candidate is concatenated, so an answer
 * that the API splits across several parts is returned whole.
 *
 * @param {string} query - The user's question.
 * @param {string} context - Pre-formatted retrieved context.
 * @param {string} systemPrompt - System instruction for the model.
 * @returns {Promise<string>} The model's answer text.
 * @throws {Error} If GOOGLE_API_KEY is unset, the API responds with a non-2xx
 *   status, or the first candidate contains no text.
 */
async function synthesize(query, context, systemPrompt) {
  const apiKey = process.env.GOOGLE_API_KEY;
  // Same guard as embedQuery: fail clearly instead of sending an undefined key.
  if (!apiKey) throw new Error('GOOGLE_API_KEY not set');
  const endpoint = `https://generativelanguage.googleapis.com/v1beta/models/${SYNTHESIS_MODEL}:generateContent`;
  const userMessage = `CONTEXT (retrieved from Sleuth's own corpus):\n\n${context}\n\n---\n\nQUESTION: ${query}`;
  const body = {
    system_instruction: { parts: [{ text: systemPrompt }] },
    contents: [{ role: 'user', parts: [{ text: userMessage }] }],
    generationConfig: { temperature: 0.3, maxOutputTokens: 1500 },
  };
  const res = await fetch(endpoint, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'x-goog-api-key': apiKey,
    },
    body: JSON.stringify(body),
  });
  // Cap the echoed body so an HTML error page can't flood logs.
  if (!res.ok) throw new Error(`Gemini synthesis ${res.status}: ${(await res.text()).slice(0, 300)}`);
  const data = await res.json();
  const parts = data?.candidates?.[0]?.content?.parts;
  const text = Array.isArray(parts)
    ? parts
      .map((part) => part?.text)
      .filter((piece) => typeof piece === 'string')
      .join('')
    : '';
  if (!text) throw new Error('Gemini synthesis: empty response');
  return text;
}
141+
142+
/**
 * Answer a question about Sleuth itself, grounded in the local RAG index.
 * Strictly gated to the Neochrome workspace via NEOCHROME_TEAM_ID.
 *
 * Flow: tenancy check → input validation → load prompts and index → embed the
 * query → nearest-neighbour search → format context → synthesize an answer.
 * The reply ends with a de-duplicated list of sources taken from the top 8
 * hits. When the search returns nothing, a fixed "nothing found" message is
 * returned and no synthesis call is made.
 *
 * @param {string} query - The question from the user.
 * @param {string} teamId - The Slack team ID of the workspace the question came from.
 * @returns {Promise<string>} - Formatted answer text to post back in Slack.
 * @throws {TenancyError} - If teamId does not match NEOCHROME_TEAM_ID.
 * @throws {Error} - If `query` is empty, the prompts file lacks a usable
 *   `orchestrator_system` entry, or any downstream step fails.
 */
async function askSelf(query, teamId) {
  assertTenancy(teamId);
  if (typeof query !== 'string' || query.trim().length === 0) {
    throw new Error('query must be a non-empty string');
  }
  const prompts = getPrompts();
  // Validate before spending an embedding call: a missing system prompt would
  // otherwise only surface as an opaque failure at synthesis time.
  const systemPrompt = prompts?.orchestrator_system;
  if (typeof systemPrompt !== 'string' || systemPrompt.length === 0) {
    throw new Error('prompts.json: orchestrator_system must be a non-empty string');
  }
  const db = getDb();
  const queryVec = await embedQuery(query);
  const hits = knnSearch(db, queryVec, TOP_K);
  if (hits.length === 0) {
    return "I couldn't find anything in my index for that question. Try `npm run rag:ingest` or rephrase.";
  }
  const context = formatContext(hits, MAX_CONTEXT_CHARS);
  const answer = await synthesize(query, context, systemPrompt);

  // Label sources with the same fallbacks the context headers use, so a hit
  // without a path or PR number never shows up as a blank or "PR #null" entry.
  const labelFor = (h) => {
    if (h.source === 'pr') return h.pr_number != null ? `PR #${h.pr_number}` : 'PR';
    return h.path ?? h.source;
  };
  const sourcesList = [...new Set(hits.slice(0, 8).map(labelFor))]
    .filter((label) => typeof label === 'string' && label.length > 0);
  return `${answer}\n\n_Sources consulted: ${sourcesList.join(', ')}_`;
}
170+
171+
module.exports = { askSelf, TenancyError };

0 commit comments

Comments
 (0)