Skip to content

Commit b7a6b0e

Browse files
committed
feat(engine): MATLAB extractor with .m disambiguation from ObjC
MATLAB shares .m with Objective-C. detectLanguage() now sniffs the first 4KB of file content: ObjC markers (@interface/@implementation/#import/#include) → objc; otherwise MATLAB. EXTENSION_MAP still maps .m → objc as the default; the heuristic only overrides when no ObjC markers exist. Vendored grammar: acristoffers/tree-sitter-matlab (built locally via tree-sitter-cli; upstream ships only Python wheels).
1 parent d14295e commit b7a6b0e

6 files changed

Lines changed: 335 additions & 2 deletions

File tree

engine/PATCHES.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,15 @@ Perl — each documented as a numbered entry below when it lands.
3636
- `const_statement` wraps an inner `assignment` node
3737
- `macro_definition` has the same structure as `function_definition`
3838
- Call edges work via the standard `call_expression` in `callTypes`; `visitFunctionBody` recursively finds them
39+
40+
### MATLAB (added 2026-05-30)
41+
- WASM grammar: `engine/src/extraction/wasm/tree-sitter-matlab.wasm` built locally from acristoffers/tree-sitter-matlab (upstream ships only Python wheels)
42+
- Extension map: `.m` is shared with Objective-C; disambiguated by `detectLanguage(filePath, content)` content heuristic that checks for ObjC markers (`@interface`, `@implementation`, `#import`, `#include`) in the first 4 KB. `EXTENSION_MAP` still maps `.m` → `objc` as the default; the heuristic overrides to `matlab` only when no ObjC markers are found.
43+
- Extractor: `engine/src/extraction/languages/matlab.ts`
44+
- Tests: extraction + disambiguation tests in `engine/__tests__/extraction.test.ts` (`MATLAB Extraction` describe block; 14 tests total)
45+
- Notes: MATLAB grammar (acristoffers/tree-sitter-matlab) key AST facts confirmed empirically:
46+
- `function_definition`: field `'name'` → function identifier; `function_output` optional named child for return values; `function_arguments` named child for params; `block` named child for body
47+
- Three function forms: `function greet()`, `function result = hello(name)`, `function [a,b] = swap(x,y)` — all produce `function_definition` nodes with the same structure
48+
- `function_call`: field `'name'` → callee identifier; used for call edges via `callTypes: ['function_call']`
49+
- `assignment`: fields `'left'` and `'right'`; top-level identifier-lhs assignments → variable nodes
50+
- Grammar does NOT use `call_expression` (unlike most other languages); uses `function_call` instead

engine/__tests__/extraction.test.ts

Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4571,3 +4571,161 @@ const MAX_SIZE = 1000
45714571
expect(names).toContain('MAX_SIZE');
45724572
});
45734573
});
4574+
4575+
describe('MATLAB Extraction', () => {
  /** Joins the given lines into a snippet that starts and ends with a newline. */
  const snippet = (...lines: string[]): string => ['', ...lines, ''].join('\n');

  // Single-output function reused by the file-node and single-return tests.
  const helloSource = snippet(
    'function result = hello(name)',
    "result = ['Hello, ', name];",
    'end',
  );

  // Three functions where main calls greet and greet calls hello.
  const callChainSource = snippet(
    'function result = hello(name)',
    "result = ['Hello, ', name];",
    'end',
    '',
    'function greet()',
    "hello('world');",
    'end',
    '',
    'function main()',
    'greet();',
    'end',
  );

  // ── Language detection ────────────────────────────────────────────────────
  // Each row: a `.m` path plus file content, and the language the content
  // heuristic is expected to pick.
  const detectionCases: ReadonlyArray<{
    title: string;
    file: string;
    content: string;
    expected: 'matlab' | 'objc';
  }> = [
    {
      title: 'should detect MATLAB files via content heuristic (no ObjC markers)',
      file: 'hello.m',
      content: 'function result = hello(name)\n result = name;\nend\n',
      expected: 'matlab',
    },
    {
      title: 'should detect ObjC .m files that have @interface',
      file: 'Foo.m',
      content: '@interface Foo : NSObject\n- (void)bar;\n@end\n',
      expected: 'objc',
    },
    {
      title: 'should detect ObjC .m files that have @implementation',
      file: 'Foo.m',
      content: '@implementation Foo\n- (void)bar {}\n@end\n',
      expected: 'objc',
    },
    {
      title: 'should detect ObjC .m files that have #import',
      file: 'Foo.m',
      content: '#import <Foundation/Foundation.h>\n\n@implementation Foo\n@end\n',
      expected: 'objc',
    },
    {
      title: 'should default to matlab for .m with no ObjC markers (just comments)',
      file: 'script.m',
      content: '% This is a MATLAB comment\n% No function defined yet\n',
      expected: 'matlab',
    },
  ];

  for (const { title, file, content, expected } of detectionCases) {
    it(title, () => {
      expect(detectLanguage(file, content)).toBe(expected);
    });
  }

  it('should fall back to objc for .m with no content provided', () => {
    // Without source text only the extension mapping applies, which yields objc.
    expect(detectLanguage('unknown.m')).toBe('objc');
  });

  it('should report MATLAB as supported', () => {
    expect(isLanguageSupported('matlab')).toBe(true);
    expect(getSupportedLanguages()).toContain('matlab');
  });

  // ── File node ─────────────────────────────────────────────────────────────
  it('should create a file node for MATLAB files', () => {
    const { nodes } = extractFromSource('hello.m', helloSource);

    const fileNode = nodes.find((node) => node.kind === 'file');
    expect(fileNode).toBeDefined();
    expect(fileNode?.name).toBe('hello.m');
    expect(fileNode?.language).toBe('matlab');
  });

  // ── Function extraction ───────────────────────────────────────────────────
  it('should extract a simple void function (no output)', () => {
    const source = snippet('function greet()', "disp('hello');", 'end');
    const extraction = extractFromSource('greet.m', source);

    const functions = extraction.nodes.filter((node) => node.kind === 'function');
    expect(functions).toHaveLength(1);
    expect(functions[0].name).toBe('greet');
    expect(functions[0].language).toBe('matlab');
  });

  it('should extract a function with a single return value', () => {
    const { nodes } = extractFromSource('hello.m', helloSource);

    const funcNode = nodes.find((node) => node.kind === 'function');
    expect(funcNode).toBeDefined();
    expect(funcNode?.name).toBe('hello');
    expect(funcNode?.signature).toContain('hello');
  });

  it('should extract a function with multiple return values', () => {
    const source = snippet('function [a, b] = swap(x, y)', 'a = y;', 'b = x;', 'end');
    const { nodes } = extractFromSource('swap.m', source);

    const funcNode = nodes.find((node) => node.kind === 'function');
    expect(funcNode).toBeDefined();
    expect(funcNode?.name).toBe('swap');
  });

  it('should extract multiple function definitions', () => {
    const extraction = extractFromSource('funcs.m', callChainSource);

    const names = extraction.nodes
      .filter((node) => node.kind === 'function')
      .map((fn) => fn.name)
      .sort();
    for (const expectedName of ['hello', 'greet', 'main']) {
      expect(names).toContain(expectedName);
    }
  });

  // ── Call edges ────────────────────────────────────────────────────────────
  it('should extract call edges from function bodies', () => {
    const extraction = extractFromSource('calls.m', callChainSource);

    const callNames = extraction.unresolvedReferences
      .filter((ref) => ref.referenceKind === 'calls')
      .map((ref) => ref.referenceName);

    expect(callNames).toContain('hello');
    expect(callNames).toContain('greet');
  });

  // ── Variable extraction ───────────────────────────────────────────────────
  it('should extract top-level variable assignments', () => {
    const source = snippet("x = hello('test');", 'y = 42;');
    const extraction = extractFromSource('script.m', source);

    const names = extraction.nodes
      .filter((node) => node.kind === 'variable')
      .map((variable) => variable.name);
    expect(names).toContain('x');
    expect(names).toContain('y');
  });
});

engine/src/extraction/grammars.ts

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ const WASM_GRAMMAR_FILES: Record<GrammarLanguage, string> = {
4040
objc: 'tree-sitter-objc.wasm',
4141
r: 'tree-sitter-r.wasm',
4242
julia: 'tree-sitter-julia.wasm',
43+
matlab: 'tree-sitter-matlab.wasm',
4344
};
4445

4546
/**
@@ -184,7 +185,7 @@ export async function loadGrammarsForLanguages(languages: Language[]): Promise<v
184185
// ABI-13 build that corrupts the shared WASM heap under web-tree-sitter
185186
// 0.25 (drops nested calls/imports on every file after the first); we
186187
// vendor the upstream ABI-15 wasm instead.
187-
const wasmPath = (lang === 'pascal' || lang === 'scala' || lang === 'lua' || lang === 'luau' || lang === 'r' || lang === 'julia')
188+
const wasmPath = (lang === 'pascal' || lang === 'scala' || lang === 'lua' || lang === 'luau' || lang === 'r' || lang === 'julia' || lang === 'matlab')
188189
? path.join(__dirname, 'wasm', wasmFile)
189190
: require.resolve(`tree-sitter-wasms/out/${wasmFile}`);
190191
const language = await WasmLanguage.load(wasmPath);
@@ -234,7 +235,15 @@ export function getParser(language: Language): Parser | null {
234235
}
235236

236237
/**
237-
* Detect language from file extension
238+
* Detect language from file extension, with content-based disambiguation
239+
* for ambiguous extensions.
240+
*
241+
* Currently disambiguates:
242+
* .h — C vs C++ vs Objective-C (existing behaviour)
243+
* .m — Objective-C vs MATLAB: ObjC has distinctive top-of-file markers
244+
* (@interface, @implementation, #import, #include). MATLAB has none.
245+
* If any ObjC marker appears in the first ~4 KB, treat as ObjC;
246+
* otherwise MATLAB.
238247
*/
239248
export function detectLanguage(filePath: string, source?: string): Language {
240249
// Play `conf/routes` has no grammar — route through the no-symbol path; the
@@ -249,6 +258,13 @@ export function detectLanguage(filePath: string, source?: string): Language {
249258
if (looksLikeObjc(source)) return 'objc';
250259
}
251260

261+
// .m files could be Objective-C or MATLAB — check source content
262+
// EXTENSION_MAP maps .m → objc as the safe default; override to matlab
263+
// only when no ObjC markers are present.
264+
if (lang === 'objc' && ext === '.m' && source) {
265+
if (!looksLikeObjc4kb(source)) return 'matlab';
266+
}
267+
252268
return lang;
253269
}
254270

@@ -269,6 +285,16 @@ function looksLikeObjc(source: string): boolean {
269285
return /@(?:interface|implementation|protocol|synthesize)\b/.test(sample);
270286
}
271287

288+
/**
 * Content sniffing for `.m` files: report whether the text looks like
 * Objective-C rather than MATLAB.
 *
 * Only the first 4096 characters of `source` are inspected. The text counts
 * as Objective-C when some line in that window begins (after optional
 * whitespace) with `@interface`, `@implementation`, `#import` or `#include`.
 *
 * @param source Full text of the file.
 * @returns `true` when one of the markers is found in the inspected window.
 */
function looksLikeObjc4kb(source: string): boolean {
  // Multiline flag: `^` anchors at the start of every line, not just the file.
  const objcLineStart = /^\s*(@interface\b|@implementation\b|#import\b|#include\b)/m;
  const head = source.slice(0, 4096);
  return objcLineStart.test(head);
}
297+
272298
/**
273299
* Check if a language is supported (has a grammar defined).
274300
* Returns true if the grammar exists, even if not yet loaded.
@@ -372,6 +398,7 @@ export function getLanguageDisplayName(language: Language): string {
372398
objc: 'Objective-C',
373399
r: 'R',
374400
julia: 'Julia',
401+
matlab: 'MATLAB',
375402
yaml: 'YAML',
376403
twig: 'Twig',
377404
xml: 'XML',

engine/src/extraction/languages/index.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import { luauExtractor } from './luau';
2828
import { objcExtractor } from './objc';
2929
import { rExtractor } from './r';
3030
import { juliaExtractor } from './julia';
31+
import { matlabExtractor } from './matlab';
3132

3233
export const EXTRACTORS: Partial<Record<Language, LanguageExtractor>> = {
3334
typescript: typescriptExtractor,
@@ -53,4 +54,5 @@ export const EXTRACTORS: Partial<Record<Language, LanguageExtractor>> = {
5354
objc: objcExtractor,
5455
r: rExtractor,
5556
julia: juliaExtractor,
57+
matlab: matlabExtractor,
5658
};
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
import { getNodeText, getChildByField } from '../tree-sitter-helpers';
2+
import type { LanguageExtractor, ExtractorContext } from '../tree-sitter-types';
3+
import type { Node as SyntaxNode } from 'web-tree-sitter';
4+
5+
/**
 * MATLAB language extractor.
 *
 * Grammar used: acristoffers/tree-sitter-matlab (vendored WASM).
 *
 * MATLAB function syntax (every form parses to `function_definition`):
 *   function greet()                  — no output
 *   function result = hello(name)     — single output
 *   function [a, b] = swap(x, y)      — multiple outputs
 *
 * Node kinds this extractor relies on:
 *   function_definition
 *     field 'name'                        — the function identifier
 *     function_output    (named child)    — optional output list
 *     function_arguments (named child)    — parameter list
 *     block              (named child)    — function body
 *   function_call                         — foo(args); registered through
 *     field 'name'                          `callTypes` for call edges
 *   assignment                            — x = expr
 *     field 'left'                        — lhs (identifier or complex expression)
 *     field 'right'                       — rhs
 *
 * Only functions, calls and identifier assignments are extracted here; no
 * class or import nodes are produced (per the original author's note,
 * classdef is handled separately and scripts have no module imports).
 */

/** Maximum number of initializer characters kept in a variable's signature. */
const MAX_INITIALIZER_PREVIEW = 100;

/**
 * Emits a 'function' node for a `function_definition` and walks its body.
 *
 * @returns `false` when the node has no 'name' field (nothing is emitted and
 *          the caller's default handling applies), otherwise `true`.
 */
function extractFunctionDefinition(node: SyntaxNode, ctx: ExtractorContext): boolean {
  const nameNode = getChildByField(node, 'name');
  if (!nameNode) return false;
  const name = getNodeText(nameNode, ctx.source);

  // Outputs, parameters and body are located by node type among the named
  // children rather than by field lookup.
  let outputText = '';
  let paramsText = '';
  let bodyNode: SyntaxNode | null = null;

  for (let i = 0; i < node.childCount; i++) {
    const child = node.child(i);
    if (!child || !child.isNamed) continue;
    switch (child.type) {
      case 'function_output':
        outputText = getNodeText(child, ctx.source); // e.g. "result =" or "[a, b] ="
        break;
      case 'function_arguments':
        paramsText = getNodeText(child, ctx.source); // e.g. "(name)" or "()"
        break;
      case 'block':
        bodyNode = child;
        break;
      default:
        break;
    }
  }

  // Signature shape: "[outputs] name(params)"; the output part is omitted
  // when the function declares no outputs.
  const signature = outputText
    ? `${outputText} ${name}${paramsText}`
    : `${name}${paramsText}`;

  const funcNode = ctx.createNode('function', name, node, {
    signature,
    isAsync: false,
    isStatic: false,
  });

  // The body is walked inside the function's scope so references found
  // there are attributed to this function.
  if (funcNode && bodyNode) {
    ctx.pushScope(funcNode.id);
    ctx.visitFunctionBody(bodyNode, funcNode.id);
    ctx.popScope();
  }
  return true;
}

/**
 * Emits a 'variable' node for an `assignment` whose left-hand side is a plain
 * identifier, then visits the right-hand side so calls in the initializer are
 * still captured.
 *
 * NOTE(review): nothing here checks nesting depth; the "top-level only"
 * behaviour presumably depends on which nodes the framework passes to
 * `visitNode` — confirm against the caller.
 *
 * @returns `false` when either side is missing or the lhs is not an
 *          identifier, otherwise `true`.
 */
function extractAssignment(node: SyntaxNode, ctx: ExtractorContext): boolean {
  const lhsNode = getChildByField(node, 'left');
  const rhsNode = getChildByField(node, 'right');
  if (!lhsNode || !rhsNode) return false;

  // Indexed / field / multi-target assignments are left to default handling.
  if (lhsNode.type !== 'identifier') return false;

  const name = getNodeText(lhsNode, ctx.source);
  const initText = getNodeText(rhsNode, ctx.source);

  // Append "..." only when text was actually cut off; an initializer of
  // exactly MAX_INITIALIZER_PREVIEW characters is shown in full, unmarked.
  let initSignature: string | undefined;
  if (initText) {
    initSignature =
      initText.length > MAX_INITIALIZER_PREVIEW
        ? `= ${initText.slice(0, MAX_INITIALIZER_PREVIEW)}...`
        : `= ${initText}`;
  }

  ctx.createNode('variable', name, node, { signature: initSignature });
  // Visit rhs to capture any calls in the initializer
  ctx.visitNode(rhsNode);
  return true;
}

export const matlabExtractor: LanguageExtractor = {
  // Definitions are handled by visitNode below, so only call nodes are
  // registered by type.
  functionTypes: [],
  classTypes: [],
  methodTypes: [],
  interfaceTypes: [],
  structTypes: [],
  enumTypes: [],
  typeAliasTypes: [],
  importTypes: [],
  callTypes: ['function_call'],
  variableTypes: [],

  // Field names — used as fallbacks; primary extraction is in visitNode
  nameField: 'name',
  bodyField: 'body',
  paramsField: 'function_arguments',
  returnField: undefined,

  isAsync: () => false,
  isStatic: () => false,

  extractImport: () => null,

  /**
   * Custom visitor for MATLAB's AST.
   *
   * Handles:
   *   function_definition              → 'function' node (all return forms)
   *   assignment with identifier lhs   → 'variable' node
   *
   * @returns `true` when the node was handled here, `false` to leave it to
   *          the caller's default handling.
   */
  visitNode: (node: SyntaxNode, ctx: ExtractorContext): boolean => {
    switch (node.type) {
      case 'function_definition':
        return extractFunctionDefinition(node, ctx);
      case 'assignment':
        return extractAssignment(node, ctx);
      default:
        return false;
    }
  },
};

0 commit comments

Comments
 (0)