Skip to content

Commit 89a772b

Browse files
committed
feat(engine): Perl language extractor (.pl/.pm/.t files)
Adds tree-sitter-perl grammar (tree-sitter-perl/tree-sitter-perl v1.0.2) wired through the standard LanguageExtractor pattern. Extracts named subroutines, function calls, package declarations, and use/require imports.
1 parent b7a6b0e commit 89a772b

6 files changed

Lines changed: 398 additions & 1 deletion

File tree

engine/PATCHES.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,22 @@ Perl — each documented as a numbered entry below when it lands.
3737
- `macro_definition` has the same structure as `function_definition`
3838
- Call edges work via the standard `call_expression` in `callTypes`; `visitFunctionBody` recursively finds them
3939

40+
### Perl (added 2026-05-30)
41+
- WASM grammar: `engine/src/extraction/wasm/tree-sitter-perl.wasm` from tree-sitter-perl/tree-sitter-perl v1.0.2
42+
- Extension map: `.pl`, `.pm`, `.t` → `perl`
43+
- Extractor: `engine/src/extraction/languages/perl.ts`
44+
- Tests: added `Perl Extraction` describe block in `engine/__tests__/extraction.test.ts` (10 tests)
45+
- Notes: Perl grammar (tree-sitter-perl v1.0.2) key AST facts confirmed empirically:
46+
- `subroutine_declaration_statement`: namedChild[0]=`bareword` (name), namedChild[1]=`block` (body); no named fields
47+
- `function_call_expression`: child[0] has type `function` (callee); no named fields
48+
- `ambiguous_function_call_expression`: like function_call_expression but wraps e.g. `print foo()`; child[0] type=`function`
49+
- `method_call_expression`: `$obj->method()` or `Class->method()`; child[2] type=`method` (method name)
50+
- `use_statement`: namedChild[0] is a `package` node holding the module name
51+
- `require_expression`: namedChild[0] is a `bareword` holding the module name
52+
- `package_statement`: namedChild[0] is a `package` node holding the package name (re-uses `package` type for both the keyword and the identifier)
53+
- The grammar does NOT use named fields (childForFieldName returns null); all extraction uses `namedChild(i)` or `child(i)` by index
54+
- `resolveName` hook is used to extract callee names from all three call node types
55+
4056
### MATLAB (added 2026-05-30)
4157
- WASM grammar: `engine/src/extraction/wasm/tree-sitter-matlab.wasm` built locally from acristoffers/tree-sitter-matlab (upstream ships only Python wheels)
4258
- Extension map: `.m` is shared with Objective-C; disambiguated by the `detectLanguage(filePath, content)` content heuristic, which checks for ObjC markers (`@interface`, `@implementation`, `#import`, `#include`) in the first 4 KB. `EXTENSION_MAP` still maps `.m` → `objc` as the default; the heuristic overrides to `matlab` only when no ObjC markers are found.

engine/__tests__/extraction.test.ts

Lines changed: 202 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4729,3 +4729,205 @@ y = 42;
47294729
expect(names).toContain('y');
47304730
});
47314731
});
4732+
4733+
describe('Perl Extraction', () => {
4734+
// ── Language detection ────────────────────────────────────────────────────
4735+
it('should detect Perl files by extension', () => {
4736+
expect(detectLanguage('script.pl')).toBe('perl');
4737+
expect(detectLanguage('MyModule.pm')).toBe('perl');
4738+
expect(detectLanguage('foo.t')).toBe('perl');
4739+
expect(detectLanguage('src/lib/Utils.pm')).toBe('perl');
4740+
});
4741+
4742+
it('should report Perl as supported', () => {
4743+
expect(isLanguageSupported('perl')).toBe(true);
4744+
expect(getSupportedLanguages()).toContain('perl');
4745+
});
4746+
4747+
// ── File node ─────────────────────────────────────────────────────────────
4748+
it('should create a file node for Perl files', () => {
4749+
const code = `
4750+
package MyPackage;
4751+
4752+
sub hello {
4753+
my ($name) = @_;
4754+
return "Hello";
4755+
}
4756+
4757+
1;
4758+
`;
4759+
const result = extractFromSource('MyPackage.pm', code);
4760+
4761+
const fileNode = result.nodes.find((n) => n.kind === 'file');
4762+
expect(fileNode).toBeDefined();
4763+
expect(fileNode?.name).toBe('MyPackage.pm');
4764+
expect(fileNode?.language).toBe('perl');
4765+
});
4766+
4767+
// ── Function (sub) extraction ─────────────────────────────────────────────
4768+
it('should extract named subroutines', () => {
4769+
const code = `
4770+
package MyPackage;
4771+
4772+
sub hello {
4773+
my ($name) = @_;
4774+
return "Hello";
4775+
}
4776+
4777+
sub greet {
4778+
return hello("world");
4779+
}
4780+
4781+
sub main {
4782+
print greet();
4783+
}
4784+
4785+
1;
4786+
`;
4787+
const result = extractFromSource('script.pl', code);
4788+
4789+
const functions = result.nodes.filter((n) => n.kind === 'function');
4790+
const names = functions.map((f) => f.name).sort();
4791+
expect(names).toContain('hello');
4792+
expect(names).toContain('greet');
4793+
expect(names).toContain('main');
4794+
expect(functions[0].language).toBe('perl');
4795+
});
4796+
4797+
it('should include sub name in function signature', () => {
4798+
const code = `
4799+
sub hello {
4800+
return "hi";
4801+
}
4802+
`;
4803+
const result = extractFromSource('hello.pl', code);
4804+
4805+
const funcNode = result.nodes.find((n) => n.kind === 'function');
4806+
expect(funcNode).toBeDefined();
4807+
expect(funcNode?.name).toBe('hello');
4808+
expect(funcNode?.signature).toContain('hello');
4809+
});
4810+
4811+
// ── Call edges ────────────────────────────────────────────────────────────
4812+
it('should extract function call edges', () => {
4813+
const code = `
4814+
sub hello {
4815+
return "Hello";
4816+
}
4817+
4818+
sub greet {
4819+
return hello("world");
4820+
}
4821+
4822+
sub main {
4823+
print greet();
4824+
}
4825+
`;
4826+
const result = extractFromSource('calls.pl', code);
4827+
4828+
const calls = result.unresolvedReferences.filter((r) => r.referenceKind === 'calls');
4829+
const callNames = calls.map((c) => c.referenceName);
4830+
expect(callNames).toContain('hello');
4831+
expect(callNames).toContain('greet');
4832+
});
4833+
4834+
// ── Package declaration ───────────────────────────────────────────────────
4835+
it('should extract package declaration as a module node', () => {
4836+
const code = `
4837+
package MyPackage;
4838+
4839+
sub hello { return "hi"; }
4840+
4841+
1;
4842+
`;
4843+
const result = extractFromSource('MyPackage.pm', code);
4844+
4845+
const modules = result.nodes.filter((n) => n.kind === 'module');
4846+
expect(modules).toHaveLength(1);
4847+
expect(modules[0].name).toBe('MyPackage');
4848+
expect(modules[0].language).toBe('perl');
4849+
});
4850+
4851+
// ── use/require imports ───────────────────────────────────────────────────
4852+
it('should extract use statements as import nodes', () => {
4853+
const code = `
4854+
use strict;
4855+
use warnings;
4856+
use MyModule qw(foo bar);
4857+
`;
4858+
const result = extractFromSource('script.pl', code);
4859+
4860+
const imports = result.nodes.filter((n) => n.kind === 'import');
4861+
const names = imports.map((i) => i.name);
4862+
expect(names).toContain('strict');
4863+
expect(names).toContain('warnings');
4864+
expect(names).toContain('MyModule');
4865+
});
4866+
4867+
it('should extract require expressions as import nodes', () => {
4868+
const code = `
4869+
require AnotherModule;
4870+
require 'some_file.pl';
4871+
`;
4872+
const result = extractFromSource('script.pl', code);
4873+
4874+
const imports = result.nodes.filter((n) => n.kind === 'import');
4875+
const names = imports.map((i) => i.name);
4876+
expect(names).toContain('AnotherModule');
4877+
});
4878+
4879+
// ── Full file extraction ──────────────────────────────────────────────────
4880+
it('should extract a complete Perl module correctly', () => {
4881+
const code = `
4882+
package MyPackage;
4883+
4884+
use strict;
4885+
use warnings;
4886+
use MyModule qw(foo bar);
4887+
require AnotherModule;
4888+
4889+
sub hello {
4890+
my ($name) = @_;
4891+
return "Hello";
4892+
}
4893+
4894+
sub greet {
4895+
return hello("world");
4896+
}
4897+
4898+
sub main {
4899+
print greet();
4900+
}
4901+
4902+
1;
4903+
`;
4904+
const result = extractFromSource('MyPackage.pm', code);
4905+
4906+
// File node
4907+
expect(result.nodes.find((n) => n.kind === 'file')).toBeDefined();
4908+
4909+
// Package → module node
4910+
const modules = result.nodes.filter((n) => n.kind === 'module');
4911+
expect(modules.map((m) => m.name)).toContain('MyPackage');
4912+
4913+
// Subs → function nodes
4914+
const functions = result.nodes.filter((n) => n.kind === 'function');
4915+
const funcNames = functions.map((f) => f.name);
4916+
expect(funcNames).toContain('hello');
4917+
expect(funcNames).toContain('greet');
4918+
expect(funcNames).toContain('main');
4919+
4920+
// use/require → import nodes
4921+
const imports = result.nodes.filter((n) => n.kind === 'import');
4922+
const importNames = imports.map((i) => i.name);
4923+
expect(importNames).toContain('strict');
4924+
expect(importNames).toContain('MyModule');
4925+
expect(importNames).toContain('AnotherModule');
4926+
4927+
// Call edges
4928+
const calls = result.unresolvedReferences.filter((r) => r.referenceKind === 'calls');
4929+
const callNames = calls.map((c) => c.referenceName);
4930+
expect(callNames).toContain('hello');
4931+
expect(callNames).toContain('greet');
4932+
});
4933+
});

engine/src/extraction/grammars.ts

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ const WASM_GRAMMAR_FILES: Record<GrammarLanguage, string> = {
4141
r: 'tree-sitter-r.wasm',
4242
julia: 'tree-sitter-julia.wasm',
4343
matlab: 'tree-sitter-matlab.wasm',
44+
perl: 'tree-sitter-perl.wasm',
4445
};
4546

4647
/**
@@ -101,6 +102,9 @@ export const EXTENSION_MAP: Record<string, Language> = {
101102
'.R': 'r',
102103
'.r': 'r',
103104
'.jl': 'julia',
105+
'.pl': 'perl',
106+
'.pm': 'perl',
107+
'.t': 'perl',
104108
// XML: file-level tracking; the MyBatis extractor matches `<mapper namespace="...">`
105109
// shape and emits SQL-statement nodes (other XML returns empty).
106110
'.xml': 'xml',
@@ -185,7 +189,7 @@ export async function loadGrammarsForLanguages(languages: Language[]): Promise<v
185189
// ABI-13 build that corrupts the shared WASM heap under web-tree-sitter
186190
// 0.25 (drops nested calls/imports on every file after the first); we
187191
// vendor the upstream ABI-15 wasm instead.
188-
const wasmPath = (lang === 'pascal' || lang === 'scala' || lang === 'lua' || lang === 'luau' || lang === 'r' || lang === 'julia' || lang === 'matlab')
192+
const wasmPath = (lang === 'pascal' || lang === 'scala' || lang === 'lua' || lang === 'luau' || lang === 'r' || lang === 'julia' || lang === 'matlab' || lang === 'perl')
189193
? path.join(__dirname, 'wasm', wasmFile)
190194
: require.resolve(`tree-sitter-wasms/out/${wasmFile}`);
191195
const language = await WasmLanguage.load(wasmPath);
@@ -399,6 +403,7 @@ export function getLanguageDisplayName(language: Language): string {
399403
r: 'R',
400404
julia: 'Julia',
401405
matlab: 'MATLAB',
406+
perl: 'Perl',
402407
yaml: 'YAML',
403408
twig: 'Twig',
404409
xml: 'XML',

engine/src/extraction/languages/index.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import { objcExtractor } from './objc';
2929
import { rExtractor } from './r';
3030
import { juliaExtractor } from './julia';
3131
import { matlabExtractor } from './matlab';
32+
import { perlExtractor } from './perl';
3233

3334
export const EXTRACTORS: Partial<Record<Language, LanguageExtractor>> = {
3435
typescript: typescriptExtractor,
@@ -55,4 +56,5 @@ export const EXTRACTORS: Partial<Record<Language, LanguageExtractor>> = {
5556
r: rExtractor,
5657
julia: juliaExtractor,
5758
matlab: matlabExtractor,
59+
perl: perlExtractor,
5860
};

0 commit comments

Comments
 (0)