@@ -122,49 +122,79 @@ Save the probe output before you start coding — you'll refer to it constantly.
122122
123123## 3. Register the language
124124
125- Three files, all small.
125+ Adding a language is **one new file plus two registry lines** (and a one-line `Language` union entry for type narrowing). The per-language
126+ registry (`src/extraction/languages/`) is the single source of truth — extension
127+ maps, include globs, grammar config, and the `EXTRACTORS` lookup are all derived
128+ from it.
126129
127- ** ` src/types .ts ` ** — add to the ` Language ` union and to ` DEFAULT_CONFIG.include ` :
130+ ** Step 3a — Create ` src/extraction/languages/foo .ts ` ** with a ` LanguageDef ` :
128131
129132``` ts
130- export type Language =
131- | ' typescript'
132- | ...
133- | ' foo' // ← add here
134- | ' unknown' ;
133+ import type { LanguageDef } from ' ./types' ;
134+ import type { LanguageExtractor } from ' ../tree-sitter-types' ;
135+
136+ // Path A languages (procedural / OO — Python, Ruby, R) define a
137+ // LanguageExtractor here and reference it from the def below.
138+ export const fooExtractor: LanguageExtractor = {
139+ functionTypes: [' function_definition' ],
140+ classTypes: [' class_definition' ],
141+ // ... see Section 5a for the full shape
142+ };
135143
136- export const DEFAULT_CONFIG: CodeGraphConfig = {
137- ...
138- include : [
139- ...
140- ' **/*.foo' , // ← and here
141- ],
144+ export const FOO_DEF: LanguageDef = {
145+ name: ' foo' ,
146+ displayName: ' Foo' ,
147+ extensions: [' .foo' ],
148+ includeGlobs: [' **/*.foo' ],
149+ grammar: {
150+ wasmFile: ' tree-sitter-foo.wasm' ,
151+ vendored: true , // omit if the wasm lives in `tree-sitter-wasms`
152+ extractor: fooExtractor ,
153+ },
154+ // For Path B languages (HCL / SQL / Liquid — non-OO), set a top-level
155+ // customExtractor (a sibling of `grammar`, not a field inside it) instead of, or in addition to, the grammar's `extractor`:
156+ // customExtractor: (filePath, source) => new FooExtractor(filePath, source).extract(),
142157};
143158```
144159
145- ** ` src/extraction/grammars .ts ` ** — wire up the wasm path, extension map, and display name :
160+ ** Step 3b — Register in ` src/extraction/languages/registry .ts ` ** (2 lines) :
146161
147162``` ts
148- const WASM_GRAMMAR_FILES: Record <GrammarLanguage , string > = {
149- ...
150- foo : ' tree-sitter-foo.wasm' ,
151- };
152-
153- // If vendored under src/extraction/wasm/ instead of tree-sitter-wasms:
154- const VENDORED_WASM_LANGUAGES: ReadonlySet <GrammarLanguage > = new Set ([
155- ' pascal' ,
156- ' foo' , // ← add here
157- ]);
163+ import { FOO_DEF } from ' ./foo' ; // alphabetical
164+ // ...
165+ const ALL_DEFS: readonly LanguageDef [] = [
166+ // ... existing definitions, alphabetical
167+ FOO_DEF ,
168+ // ...
169+ ];
170+ ```
158171
159- export const EXTENSION_MAP: Record <string , Language > = {
160- ...
161- ' .foo' : ' foo' ,
162- };
172+ ** Step 3c — Add ` 'foo' ` to the ` Language ` union in ` src/types.ts ` ** (1 line):
163173
164- // And in getLanguageDisplayName():
165- foo : ' Foo' ,
174+ ``` ts
175+ export type Language =
176+ | ' typescript'
177+ | ...
178+ | ' foo' // ← add here
179+ | ' unknown' ;
166180```
167181
182+ That's it. ` DEFAULT_CONFIG.include ` , ` EXTENSION_MAP ` , the ` EXTRACTORS ` lookup,
183+ and ` getLanguageDisplayName() ` are all derived from the registry — no parallel
184+ lists to keep in sync.
185+
186+ Besides the two registry lines, the `Language` union update is the only source change that touches a shared file. New
187+ languages registered only via the registry (without a `Language` union entry)
188+ also work at runtime — the union is mostly for TypeScript narrowing in
189+ language-specific resolution code.
190+
191+ > **Why per-file?** Two PRs adding two different languages used to collide on
192+ > the same `EXTRACTORS` map, the same `EXTENSION_MAP`, the same `Language`
193+ > union, and the same `WASM_GRAMMAR_FILES` table. With per-file `LanguageDef`s,
194+ > two language PRs only conflict if their alphabetical positions in `registry.ts`
195+ > happen to land on the same line — almost never — or if both add a `Language` union entry at the same spot in `src/types.ts`. See `src/extraction/languages/`
196+ > for ~20 worked examples.
197+
168198** ` CLAUDE.md ` ** — append the language to the "Supported Languages" line so the
169199LLM-readable architecture doc stays in sync.
170200
@@ -221,14 +251,14 @@ export const fooExtractor: LanguageExtractor = {
221251};
222252```
223253
224- Then register it in ` src/extraction/languages/index.ts ` :
254+ Reference it from your `LanguageDef` (Step 3a in Section 3):
225255
226256``` ts
227- import { fooExtractor } from ' ./ foo' ;
228-
229- export const EXTRACTORS : Partial < Record < Language , LanguageExtractor >> = {
230- ...
231- foo : fooExtractor ,
257+ // in src/extraction/languages/ foo.ts
258+ export const FOO_DEF : LanguageDef = {
259+ name: ' foo ' ,
260+ // ...
261+ grammar: { wasmFile: ' tree-sitter- foo.wasm ' , vendored: true , extractor: fooExtractor } ,
232262};
233263```
234264
@@ -299,20 +329,30 @@ export class FooExtractor {
299329}
300330```
301331
302- Wire the dispatch in ` src/extraction/tree-sitter.ts ` :
332+ Wire the dispatch via `customExtractor` in your `LanguageDef` (Step 3a in Section 3):
303333
304334``` ts
305- import { FooExtractor } from ' ./foo-extractor' ;
306-
307- export function extractFromSource(filePath , source , language ? ) {
308- ...
309- if (detectedLanguage === ' foo' ) {
310- return new FooExtractor (filePath , source ).extract ();
311- }
312- ...
313- }
335+ // in src/extraction/languages/foo.ts
336+ import { FooExtractor } from ' ../foo-extractor' ;
337+ import type { LanguageDef } from ' ./types' ;
338+
339+ export const FOO_DEF: LanguageDef = {
340+ name: ' foo' ,
341+ displayName: ' Foo' ,
342+ extensions: [' .foo' ],
343+ includeGlobs: [' **/*.foo' ],
344+ // For languages that need a tree-sitter parser AND a custom extractor
345+ // (HCL, SQL): set both `grammar` and `customExtractor`. The grammar
346+ // entry only registers the wasm so the parser is available; the
347+ // customExtractor takes the dispatch.
348+ grammar: { wasmFile: ' tree-sitter-foo.wasm' , vendored: true , extractor: { /* skeleton */ } },
349+ customExtractor : (filePath , source ) => new FooExtractor (filePath , source ).extract (),
350+ };
314351```
315352
353+ The dispatch in ` src/extraction/tree-sitter.ts ` reads ` customExtractor ` off
354+ the language def — no per-language ` if ` branches to maintain.
355+
316356** Worked examples:**
317357
318358- ` src/extraction/hcl-extractor.ts ` — Terraform / HCL. Block-based DDL. Each
0 commit comments