sourcebot/packages/mcp/src/index.ts at d5e44c07e4c606203bb96ea6cad3a80f2dbf1ed2 · sourcebot-dev/sourcebot · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
#!/usr/bin/env node

// Entry point for the MCP server
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
import _dedent from "dedent";
import escapeStringRegexp from 'escape-string-regexp';
import { z } from 'zod';
import { askCodebase, getFileSource, listCommits, listLanguageModels, listRepos, listTree, search } from './client.js';
import { env, numberSchema } from './env.js';
import { askCodebaseRequestSchema, DEFAULT_MAX_TREE_ENTRIES, DEFAULT_TREE_DEPTH, fileSourceRequestSchema, listCommitsQueryParamsSchema, listReposQueryParamsSchema, listTreeRequestSchema, MAX_MAX_TREE_ENTRIES, MAX_TREE_DEPTH } from './schemas.js';
import { AskCodebaseRequest, FileSourceRequest, ListCommitsQueryParamsSchema, ListReposQueryParams, ListTreeEntry, ListTreeRequest, TextContent } from './types.js';
import { buildTreeNodeIndex, joinTreePath, normalizeTreePath, sortTreeEntries } from './utils.js';

const dedent = _dedent.withOptions({ alignValues: true });

// Create MCP server
const server = new McpServer({
    name: 'sourcebot-mcp-server',
    version: '0.1.0',
});


server.tool(
    "search_code",
    dedent`
    Searches for code that matches the provided search query as a substring by default, or as a regular expression if useRegex is true. Useful for exploring remote repositories by searching for exact symbols, functions, variables, or specific code patterns. To determine if a repository is indexed, use the \`list_repos\` tool. By default, searches are global and will search the default branch of all repositories. Searches can be scoped to specific repositories, languages, and branches. When referencing code outputted by this tool, always include the file's external URL as a link. This makes it easier for the user to view the file, even if they don't have it locally checked out.
    `,
    {
        query: z
            .string()
            .describe(`The search pattern to match against code contents. Do not escape quotes in your query.`)
            // Escape backslashes first, then quotes, and wrap in double quotes
            // so the query is treated as a literal phrase (like grep).
            .transform((val) => {
                const escaped = val.replace(/\\/g, '\\\\').replace(/"/g, '\\"');
                return `"${escaped}"`;
            }),
        useRegex: z
            .boolean()
            .describe(`Whether to use regular expression matching to match the search query against code contents. When false, substring matching is used. (default: false)`)
            .optional(),
        filterByRepos: z
            .array(z.string())
            .describe(`Scope the search to the provided repositories.`)
            .optional(),
        filterByLanguages: z
            .array(z.string())
            .describe(`Scope the search to the provided languages.`)
            .optional(),
        filterByFilepaths: z
            .array(z.string())
            .describe(`Scope the search to the provided filepaths.`)
            .optional(),
        caseSensitive: z
            .boolean()
            .describe(`Whether the search should be case sensitive (default: false).`)
            .optional(),
        includeCodeSnippets: z
            .boolean()
            .describe(`Whether to include the code snippets in the response. If false, only the file's URL, repository, and language will be returned. (default: false)`)
            .optional(),
        ref: z
            .string()
            .describe(`Commit SHA, branch or tag name to search on. If not provided, defaults to the default branch (usually 'main' or 'master').`)
            .optional(),
        maxTokens: numberSchema
            .describe(`The maximum number of tokens to return (default: ${env.DEFAULT_MINIMUM_TOKENS}). Higher values provide more context but consume more tokens. Values less than ${env.DEFAULT_MINIMUM_TOKENS} will be ignored.`)
            .transform((val) => (val < env.DEFAULT_MINIMUM_TOKENS ? env.DEFAULT_MINIMUM_TOKENS : val))
            .optional(),
    },
    async ({
        query,
        filterByRepos: repos = [],
        filterByLanguages: languages = [],
        filterByFilepaths: filepaths = [],
        maxTokens = env.DEFAULT_MINIMUM_TOKENS,
        includeCodeSnippets = false,
        caseSensitive = false,
        ref,
        useRegex = false,
    }) => {
        if (repos.length > 0) {
            query += ` (repo:${repos.map(id => escapeStringRegexp(id)).join(' or repo:')})`;
        }

        if (languages.length > 0) {
            query += ` (lang:${languages.join(' or lang:')})`;
        }

        if (filepaths.length > 0) {
            query += ` (file:${filepaths.map(filepath => escapeStringRegexp(filepath)).join(' or file:')})`;
        }

        if (ref) {
            query += ` ( rev:${ref} )`;
        }

        const response = await search({
            query,
            matches: env.DEFAULT_MATCHES,
            contextLines: env.DEFAULT_CONTEXT_LINES,
            isRegexEnabled: useRegex,
            isCaseSensitivityEnabled: caseSensitive,
        });

        if (response.files.length === 0) {
            return {
                content: [{
                    type: "text",
                    text: `No results found for the query: ${query}`,
                }],
            };
        }

        const content: TextContent[] = [];
        let totalTokens = 0;
        let isResponseTruncated = false;

        for (const file of response.files) {
            const numMatches = file.chunks.reduce(
                (acc, chunk) => acc + chunk.matchRanges.length,
                0,
            );
            let text = dedent`
            file: ${file.webUrl}
            num_matches: ${numMatches}
            repo: ${file.repository}
            language: ${file.language}
            `;

            if (includeCodeSnippets) {
                const snippets = file.chunks.map(chunk => {
                    return `\`\`\`\n${chunk.content}\n\`\`\``
                }).join('\n');
                text += `\n\n${snippets}`;
            }


            // Rough estimate of the number of tokens in the text
            // @see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
            const tokens = text.length / 4;

            if ((totalTokens + tokens) > maxTokens) {
                // Calculate remaining token budget
                const remainingTokens = maxTokens - totalTokens;

                if (remainingTokens > 100) {  // Only truncate if meaningful space left
                    // Truncate text to fit remaining tokens (tokens ≈ chars/4)
                    const maxLength = Math.floor(remainingTokens * 4);
                    const truncatedText = text.substring(0, maxLength) + "\n\n...[content truncated due to token limit]";

                    content.push({
                        type: "text",
                        text: truncatedText,
                    });

                    totalTokens += remainingTokens;
                }

                isResponseTruncated = true;
                break;
            }

            totalTokens += tokens;
            content.push({
                type: "text",
                text,
            });
        }

        if (isResponseTruncated) {
            content.push({
                type: "text",
                text: `The response was truncated because the number of tokens exceeded the maximum limit of ${maxTokens}.`,
            });
        }

        return {
            content,
        }
    }
);

server.tool(
    "list_commits",
    dedent`Get a list of commits for a given repository.`,
    listCommitsQueryParamsSchema.shape,
    async (request: ListCommitsQueryParamsSchema) => {
        const result = await listCommits(request);

        return {
            content: [{
                type: "text", text: JSON.stringify(result)
            }],
        };
    }
);

server.tool(
    "list_repos",
    dedent`Lists repositories in the organization with optional filtering and pagination.`,
    listReposQueryParamsSchema.shape,
    async (request: ListReposQueryParams) => {
        const result = await listRepos(request);

        return {
            content: [{
                type: "text", text: JSON.stringify({
                    repos: result.repos.map((repo) => ({
                        name: repo.repoName,
                        url: repo.webUrl,
                        pushedAt: repo.pushedAt,
                    })),
                    totalCount: result.totalCount,
                })
            }]
        };
    }
);

server.tool(
    "read_file",
    dedent`Reads the source code for a given file.`,
    fileSourceRequestSchema.shape,
    async (request: FileSourceRequest) => {
        const response = await getFileSource(request);

        return {
            content: [{
                type: "text", text: JSON.stringify({
                    source: response.source,
                    language: response.language,
                    path: response.path,
                    url: response.webUrl,
                })
            }]
        };
    }
);

server.tool(
    "list_tree",
    dedent`
    Lists files and directories from a repository path. This can be used as a repo tree tool or directory listing tool.
    Returns a flat list of entries with path metadata and depth relative to the requested path.
    `,
    listTreeRequestSchema.shape,
    async ({
        repo,
        path = '',
        ref = 'HEAD',
        depth = DEFAULT_TREE_DEPTH,
        includeFiles = true,
        includeDirectories = true,
        maxEntries = DEFAULT_MAX_TREE_ENTRIES,
    }: ListTreeRequest) => {
        const normalizedPath = normalizeTreePath(path);
        const normalizedDepth = Math.min(depth, MAX_TREE_DEPTH);
        const normalizedMaxEntries = Math.min(maxEntries, MAX_MAX_TREE_ENTRIES);

        if (!includeFiles && !includeDirectories) {
            return {
                content: [{
                    type: "text",
                    text: JSON.stringify({
                        repo,
                        ref,
                        path: normalizedPath,
                        entries: [] as ListTreeEntry[],
                        totalReturned: 0,
                        truncated: false,
                    }),
                }],
            };
        }

        // BFS frontier of directories still to expand. Each item stores a repo-relative
        // directory path plus the current depth from the requested root `path`.
        const queue: Array<{ path: string; depth: number }> = [{ path: normalizedPath, depth: 0 }];

        // Tracks directory paths that have already been enqueued.
        // With the current single-root traversal duplicates are uncommon, but this
        // prevents duplicate expansion if we later support overlapping multi-root
        // inputs (e.g. ["src", "src/lib"]) or receive overlapping tree data.
        const queuedPaths = new Set<string>([normalizedPath]);

        const seenEntries = new Set<string>();
        const entries: ListTreeEntry[] = [];
        let truncated = false;

        // Traverse breadth-first by depth, batching all directories at the same
        // depth into a single /api/tree request per iteration.
        while (queue.length > 0 && !truncated) {
            const currentDepth = queue[0]!.depth;
            const currentLevelPaths: string[] = [];

            // Drain only the current depth level so we can issue one API call
            // for all sibling directories before moving deeper.
            while (queue.length > 0 && queue[0]!.depth === currentDepth) {
                const next = queue.shift()!;
                currentLevelPaths.push(next.path);
            }

            // Ask Sourcebot for a tree spanning all requested paths at this level.
            const treeResponse = await listTree({
                repoName: repo,
                revisionName: ref,
                paths: currentLevelPaths.filter(Boolean),
            });
            const treeNodeIndex = buildTreeNodeIndex(treeResponse.tree);

            for (const currentPath of currentLevelPaths) {
                const currentNode = currentPath === '' ? treeResponse.tree : treeNodeIndex.get(currentPath);
                if (!currentNode || currentNode.type !== 'tree') {
                    // Skip paths that are missing from the response or resolve to a
                    // file node. We only iterate children of directories.
                    continue;
                }

                for (const child of currentNode.children) {
                    if (child.type !== 'tree' && child.type !== 'blob') {
                        // Skip non-standard git object types (e.g. unexpected entries)
                        // since this tool only exposes directories and files.
                        continue;
                    }

                    const childPath = joinTreePath(currentPath, child.name);
                    const childDepth = currentDepth + 1;

                    // Queue child directories for the next depth level only if
                    // they are within the requested depth bound.
                    if (child.type === 'tree' && childDepth < normalizedDepth && !queuedPaths.has(childPath)) {
                        queue.push({ path: childPath, depth: childDepth });
                        queuedPaths.add(childPath);
                    }

                    if ((child.type === 'blob' && !includeFiles) || (child.type === 'tree' && !includeDirectories)) {
                        // Skip entries filtered out by caller preferences
                        // (`includeFiles` / `includeDirectories`).
                        continue;
                    }

                    const key = `${child.type}:${childPath}`;
                    if (seenEntries.has(key)) {
                        // Skip duplicates when multiple requested paths overlap and
                        // surface the same child entry.
                        continue;
                    }
                    seenEntries.add(key);

                    // Stop collecting once the entry budget is exhausted.
                    if (entries.length >= normalizedMaxEntries) {
                        truncated = true;
                        break;
                    }

                    entries.push({
                        type: child.type,
                        path: childPath,
                        name: child.name,
                        parentPath: currentPath,
                        depth: childDepth,
                    });
                }

                if (truncated) {
                    break;
                }
            }
        }

        const sortedEntries = sortTreeEntries(entries);

        return {
            content: [{
                type: "text",
                text: JSON.stringify({
                    repo,
                    ref,
                    path: normalizedPath,
                    entries: sortedEntries,
                    totalReturned: sortedEntries.length,
                    truncated,
                }),
            }]
        };
    }
);

server.tool(
    "list_language_models",
    dedent`Lists the available language models configured on the Sourcebot instance. Use this to discover which models can be specified when calling ask_codebase.`,
    {},
    async () => {
        const models = await listLanguageModels();

        return {
            content: [{
                type: "text",
                text: JSON.stringify(models),
            }],
        };
    }
);

server.tool(
    "ask_codebase",
    dedent`
    Ask a natural language question about the codebase. This tool uses an AI agent to autonomously search code, read files, and find symbol references/definitions to answer your question.

    The agent will:
    - Analyze your question and determine what context it needs
    - Search the codebase using multiple strategies (code search, symbol lookup, file reading)
    - Synthesize findings into a comprehensive answer with code references

    Returns a detailed answer in markdown format with code references, plus a link to view the full research session (including all tool calls and reasoning) in the Sourcebot web UI.

    When using this in shared environments (e.g., Slack), you can set the visibility parameter to 'PUBLIC' to ensure everyone can access the chat link.

    This is a blocking operation that may take 30-60+ seconds for complex questions as the agent researches the codebase.
    `,
    askCodebaseRequestSchema.shape,
    async (request: AskCodebaseRequest) => {
        const response = await askCodebase(request);

        // Format the response with the answer and a link to the chat
        const formattedResponse = dedent`
        ${response.answer}

        ---
        **View full research session:** ${response.chatUrl}
        **Model used:** ${response.languageModel.model}
        `;

        return {
            content: [{
                type: "text",
                text: formattedResponse,
            }],
        };
    }
);

const runServer = async () => {
    const transport = new StdioServerTransport();
    await server.connect(transport);
}

runServer().catch((error) => {
    console.error('Failed to start MCP server:', error);
    process.exit(1);
});