Skip to content

Commit ef4ad0c

Browse files
committed
Add MATLAB, Lean 4, FORM, and Magma language support (59 -> 63)
Adds custom tree-sitter grammars for FORM and Magma (no upstream grammars exist). MATLAB uses acristoffers/tree-sitter-matlab; Lean uses leanprover/tree-sitter-lean4. Includes .m file disambiguation (Objective-C vs. MATLAB) via Linguist heuristics.
2 parents 57dd8f4 + 04075ad commit ef4ad0c

56 files changed

Lines changed: 3140321 additions & 0 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

internal/cbm/cbm.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,11 @@ var languageToC = map[lang.Language]C.CBMLanguage{
205205
lang.Meson: C.CBM_LANG_MESON,
206206
lang.GLSL: C.CBM_LANG_GLSL,
207207
lang.INI: C.CBM_LANG_INI,
208+
// Scientific/math languages
209+
lang.MATLAB: C.CBM_LANG_MATLAB,
210+
lang.Lean: C.CBM_LANG_LEAN,
211+
lang.FORM: C.CBM_LANG_FORM,
212+
lang.Magma: C.CBM_LANG_MAGMA,
208213
}
209214

210215
// ParseTimeoutMicros is the default per-file parse timeout (10 seconds).

internal/cbm/cbm.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,11 @@ typedef enum {
6969
CBM_LANG_MESON,
7070
CBM_LANG_GLSL,
7171
CBM_LANG_INI,
72+
// Scientific/math languages
73+
CBM_LANG_MATLAB,
74+
CBM_LANG_LEAN,
75+
CBM_LANG_FORM,
76+
CBM_LANG_MAGMA,
7277
CBM_LANG_COUNT
7378
} CBMLanguage;
7479

internal/cbm/extract_defs.c

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,36 @@ static TSNode resolve_func_name(TSNode node, CBMLanguage lang, const char* sourc
207207
return null_node;
208208
}
209209

210+
// MATLAB: function_definition — name via "name" field
211+
if (lang == CBM_LANG_MATLAB && strcmp(kind, "function_definition") == 0) {
212+
// MATLAB grammar: function [ret] = name(args)
213+
// The "name" field should be set; otherwise fall back to the first identifier child
214+
if (!ts_node_is_null(name)) return name;
215+
return cbm_find_child_by_kind(node, "identifier");
216+
}
217+
218+
// Lean: def/theorem/instance/abbrev — name via declId field
219+
if (lang == CBM_LANG_LEAN) {
220+
// Lean grammar uses "declId" field for the name (6 chars)
221+
TSNode decl_id = ts_node_child_by_field_name(node, "declId", 6);
222+
if (!ts_node_is_null(decl_id)) {
223+
// declId contains an identifier
224+
TSNode id = cbm_find_child_by_kind(decl_id, "ident");
225+
if (!ts_node_is_null(id)) return id;
226+
// Fallback: first named child of declId
227+
if (ts_node_named_child_count(decl_id) > 0)
228+
return ts_node_named_child(decl_id, 0);
229+
return decl_id;
230+
}
231+
// Fallback: look for "name" field or first identifier
232+
if (!ts_node_is_null(name)) return name;
233+
return cbm_find_child_by_kind(node, "ident");
234+
}
235+
236+
// FORM: procedure_definition — name via "name" field (standard)
237+
// Magma: function/procedure/intrinsic_definition — name via "name" field (standard)
238+
// Both use the standard "name" field, which is already handled earlier in this file, so no language-specific branch is needed here
239+
210240
// C/C++/CUDA/GLSL: function_definition — name is inside the declarator chain
211241
// C grammar: function_definition{declarator:function_declarator{declarator:identifier}}
212242
if ((lang == CBM_LANG_C || lang == CBM_LANG_CPP || lang == CBM_LANG_CUDA ||

internal/cbm/extract_imports.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -609,6 +609,15 @@ void cbm_extract_imports(CBMExtractCtx* ctx) {
609609
case CBM_LANG_DART:
610610
parse_generic_imports(ctx, "import_declaration");
611611
break;
612+
case CBM_LANG_LEAN:
613+
parse_generic_imports(ctx, "import");
614+
break;
615+
case CBM_LANG_FORM:
616+
parse_generic_imports(ctx, "include_directive");
617+
break;
618+
case CBM_LANG_MAGMA:
619+
parse_generic_imports(ctx, "load_statement");
620+
break;
612621
default:
613622
break;
614623
}

internal/cbm/grammar_form.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
// Vendored tree-sitter grammar: form
2+
#include "vendored/grammars/form/parser.c"

internal/cbm/grammar_lean.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
// Vendored tree-sitter grammar: lean
2+
#include "vendored/grammars/lean/parser.c"
3+
#include "vendored/grammars/lean/scanner.c"

internal/cbm/grammar_magma.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
// Vendored tree-sitter grammar: magma
2+
#include "vendored/grammars/magma/parser.c"

internal/cbm/grammar_matlab.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
// Vendored tree-sitter grammar: matlab
2+
#include "vendored/grammars/matlab/parser.c"
3+
#include "vendored/grammars/matlab/scanner.c"

internal/cbm/helpers.c

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,8 @@ bool cbm_is_test_file(const char* rel_path, CBMLanguage lang) {
202202
case CBM_LANG_C:
203203
return has_suffix(base, "_test.c") || has_suffix(base, "_test.cc") ||
204204
has_suffix(base, "_test.cpp") || has_prefix(base, "test_");
205+
case CBM_LANG_MATLAB:
206+
return has_prefix(base, "test_") || has_prefix(base, "Test");
205207
default:
206208
return false;
207209
}
@@ -284,6 +286,10 @@ static const char* func_kinds_zig[] = {"function_declaration","test_declaration"
284286
static const char* func_kinds_bash[] = {"function_definition",NULL};
285287
static const char* func_kinds_erlang[] = {"function_clause",NULL};
286288
static const char* func_kinds_csharp[] = {"method_declaration","constructor_declaration",NULL};
289+
static const char* func_kinds_matlab[] = {"function_definition",NULL};
290+
static const char* func_kinds_lean[] = {"def","theorem","instance","abbrev",NULL};
291+
static const char* func_kinds_form[] = {"procedure_definition",NULL};
292+
static const char* func_kinds_magma[] = {"function_definition","procedure_definition","intrinsic_definition",NULL};
287293
static const char* func_kinds_generic[] = {"function_declaration","function_definition","method_declaration","method_definition",NULL};
288294

289295
static const char** func_kinds_for_lang(CBMLanguage lang) {
@@ -309,6 +315,10 @@ static const char** func_kinds_for_lang(CBMLanguage lang) {
309315
case CBM_LANG_BASH: return func_kinds_bash;
310316
case CBM_LANG_ERLANG: return func_kinds_erlang;
311317
case CBM_LANG_CSHARP: return func_kinds_csharp;
318+
case CBM_LANG_MATLAB: return func_kinds_matlab;
319+
case CBM_LANG_LEAN: return func_kinds_lean;
320+
case CBM_LANG_FORM: return func_kinds_form;
321+
case CBM_LANG_MAGMA: return func_kinds_magma;
312322
default: return func_kinds_generic;
313323
}
314324
}
@@ -439,6 +449,9 @@ static const char* module_parents_config[] = {"document","table","table_array_el
439449
static const char* module_parents_hcl[] = {"config_file",NULL};
440450
static const char* module_parents_makefile[] = {"makefile",NULL};
441451
static const char* module_parents_commonlisp[] = {"source",NULL};
452+
static const char* module_parents_matlab[] = {"source_file",NULL};
453+
static const char* module_parents_form[] = {"source_file",NULL};
454+
static const char* module_parents_magma[] = {"source_file",NULL};
442455

443456
bool cbm_is_module_level(TSNode node, CBMLanguage lang) {
444457
TSNode parent = ts_node_parent(node);
@@ -519,6 +532,10 @@ bool cbm_is_module_level(TSNode node, CBMLanguage lang) {
519532
case CBM_LANG_R: parents = module_parents_php; break; // program
520533
case CBM_LANG_MAKEFILE: parents = module_parents_makefile; break;
521534
case CBM_LANG_COMMONLISP: parents = module_parents_commonlisp; break;
535+
case CBM_LANG_MATLAB: parents = module_parents_matlab; break;
536+
case CBM_LANG_LEAN: parents = module_parents_zig; break; // source_file
537+
case CBM_LANG_FORM: parents = module_parents_form; break;
538+
case CBM_LANG_MAGMA: parents = module_parents_magma; break;
522539
default: return false;
523540
}
524541
if (parents) {

internal/cbm/lang_specs.c

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,11 @@ extern const TSLanguage* tree_sitter_svelte(void);
6262
extern const TSLanguage* tree_sitter_meson(void);
6363
extern const TSLanguage* tree_sitter_glsl(void);
6464
extern const TSLanguage* tree_sitter_ini(void);
65+
// Scientific/math languages
66+
extern const TSLanguage* tree_sitter_matlab(void);
67+
extern const TSLanguage* tree_sitter_lean(void);
68+
extern const TSLanguage* tree_sitter_form(void);
69+
extern const TSLanguage* tree_sitter_magma(void);
6570

6671
// -- Empty sentinel --
6772
static const char* empty_types[] = {NULL};
@@ -545,6 +550,37 @@ static const char* ini_module_types[] = {"document",NULL};
545550
static const char* ini_class_types[] = {"section",NULL};
546551
static const char* ini_var_types[] = {"setting",NULL};
547552

553+
// ==================== MATLAB ====================
554+
static const char* matlab_func_types[] = {"function_definition",NULL};
555+
static const char* matlab_class_types[] = {"class_definition",NULL};
556+
static const char* matlab_module_types[] = {"source_file",NULL};
557+
static const char* matlab_branch_types[] = {"if_statement","for_statement","while_statement","switch_statement","try_statement",NULL};
558+
static const char* matlab_var_types[] = {"assignment",NULL};
559+
560+
// ==================== LEAN ====================
561+
static const char* lean_func_types[] = {"def","theorem","instance","abbrev",NULL};
562+
static const char* lean_class_types[] = {"structure","class_inductive","inductive",NULL};
563+
static const char* lean_module_types[] = {"module",NULL};
564+
static const char* lean_import_types[] = {"import",NULL};
565+
static const char* lean_branch_types[] = {"if","match","do",NULL};
566+
567+
// ==================== FORM ====================
568+
static const char* form_func_types[] = {"procedure_definition",NULL};
569+
static const char* form_module_types[] = {"source_file",NULL};
570+
static const char* form_call_types[] = {"call_statement",NULL};
571+
static const char* form_import_types[] = {"include_directive",NULL};
572+
static const char* form_branch_types[] = {"if_statement","repeat_statement","do_loop",NULL};
573+
static const char* form_var_types[] = {"declaration_statement",NULL};
574+
static const char* form_assign_types[] = {"substitution_statement",NULL};
575+
576+
// ==================== MAGMA ====================
577+
static const char* magma_func_types[] = {"function_definition","procedure_definition","intrinsic_definition",NULL};
578+
static const char* magma_module_types[] = {"source_file",NULL};
579+
static const char* magma_call_types[] = {"call_expression",NULL};
580+
static const char* magma_import_types[] = {"load_statement",NULL};
581+
static const char* magma_branch_types[] = {"if_statement","for_statement","while_statement","repeat_statement","case_statement",NULL};
582+
static const char* magma_var_types[] = {"assignment_statement",NULL};
583+
548584
// ==================== NEW LANG ENV ACCESS ====================
549585
static const char* julia_env_funcs[] = {"ENV",NULL};
550586
static const char* nix_env_funcs[] = {"builtins.getEnv",NULL};
@@ -848,6 +884,26 @@ static const CBMLangSpec lang_specs[CBM_LANG_COUNT] = {
848884
{CBM_LANG_INI, empty_types, ini_class_types, empty_types, ini_module_types, empty_types,
849885
empty_types, empty_types, empty_types, ini_var_types, empty_types,
850886
empty_types, NULL, empty_types, NULL, NULL},
887+
888+
// CBM_LANG_MATLAB (definitions-only; no call graph because A(1) is syntactically ambiguous: it may be array indexing or a function call)
889+
{CBM_LANG_MATLAB, matlab_func_types, matlab_class_types, empty_types, matlab_module_types, empty_types,
890+
empty_types, empty_types, matlab_branch_types, matlab_var_types, matlab_var_types,
891+
empty_types, NULL, empty_types, NULL, NULL},
892+
893+
// CBM_LANG_LEAN (definitions-only, Lean 4 syntax is runtime-extensible)
894+
{CBM_LANG_LEAN, lean_func_types, lean_class_types, empty_types, lean_module_types, empty_types,
895+
lean_import_types, empty_types, lean_branch_types, empty_types, empty_types,
896+
empty_types, NULL, empty_types, NULL, NULL},
897+
898+
// CBM_LANG_FORM
899+
{CBM_LANG_FORM, form_func_types, empty_types, empty_types, form_module_types, form_call_types,
900+
form_import_types, empty_types, form_branch_types, form_var_types, form_assign_types,
901+
empty_types, NULL, empty_types, NULL, NULL},
902+
903+
// CBM_LANG_MAGMA
904+
{CBM_LANG_MAGMA, magma_func_types, empty_types, empty_types, magma_module_types, magma_call_types,
905+
magma_import_types, empty_types, magma_branch_types, magma_var_types, magma_var_types,
906+
empty_types, NULL, empty_types, NULL, NULL},
851907
};
852908

853909
const CBMLangSpec* cbm_lang_spec(CBMLanguage lang) {
@@ -916,6 +972,10 @@ const TSLanguage* cbm_ts_language(CBMLanguage lang) {
916972
case CBM_LANG_MESON: return tree_sitter_meson();
917973
case CBM_LANG_GLSL: return tree_sitter_glsl();
918974
case CBM_LANG_INI: return tree_sitter_ini();
975+
case CBM_LANG_MATLAB: return tree_sitter_matlab();
976+
case CBM_LANG_LEAN: return tree_sitter_lean();
977+
case CBM_LANG_FORM: return tree_sitter_form();
978+
case CBM_LANG_MAGMA: return tree_sitter_magma();
919979
default: return NULL;
920980
}
921981
}

0 commit comments

Comments
 (0)