Skip to content

Commit a0715ff

Browse files
DeusData and Koolerx committed
Add JS/TS IMPORTS resolution, Channel schema, lint cleanup
Features: - Relative import resolution for JS/TS/Python/Ruby (./foo, ../bar, leading dots) via cbm_pipeline_resolve_relative_import in fqn.c - Generic Channel detection: Socket.IO emit/on, EventEmitter patterns produce Channel nodes with EMITS/LISTENS_ON edges and transport property (extract_channels.c) - Constant resolution trick: `const EVENT = "foo"; emit(EVENT)` resolves the channel name through a per-file constant table Lint fixes (337 errors across 16 files): - Named constants replacing all magic numbers (enums/defines) - Cognitive complexity reduced via helper extraction (table-driven sqlite_writer, pipeline predump passes, graph_buffer dump phases) - SQLITE_TRANSIENT int-to-ptr workaround in store.c + mcp.c - clang-format + cppcheck clean Co-Authored-By: Koolerx <tommy@koolerx.com>
1 parent 647af0b commit a0715ff

28 files changed

+3505
-1988
lines changed

Makefile.cbm

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,7 @@ EXTRACTION_SRCS = \
122122
$(CBM_DIR)/extract_type_refs.c \
123123
$(CBM_DIR)/extract_type_assigns.c \
124124
$(CBM_DIR)/extract_env_accesses.c \
125+
$(CBM_DIR)/extract_channels.c \
125126
$(CBM_DIR)/extract_k8s.c \
126127
$(CBM_DIR)/helpers.c \
127128
$(CBM_DIR)/lang_specs.c \

internal/cbm/cbm.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,11 @@ void cbm_resolvedcall_push(CBMResolvedCallArray *arr, CBMArena *a, CBMResolvedCa
152152
arr->items[arr->count++] = rc;
153153
}
154154

155+
/* Append one channel record to `arr`.
 *
 * GROW_ARRAY(arr, a) runs first; NOTE(review): it presumably ensures room for
 * one more element using arena `a` — its definition is not shown here. The
 * record is then stored by value at index `count`, and `count` is advanced. */
void cbm_channels_push(CBMChannelArray *arr, CBMArena *a, CBMChannel ch) {
    GROW_ARRAY(arr, a);
    const int slot = arr->count;
    arr->items[slot] = ch;
    arr->count = slot + 1;
}
159+
155160
// --- String input reader (for parse_with_options) ---
156161

157162
typedef struct {
@@ -336,6 +341,9 @@ CBMFileResult *cbm_extract_file(const char *source, int source_len, CBMLanguage
336341
cbm_extract_imports(&ctx);
337342
cbm_extract_unified(&ctx);
338343

344+
// Channel detection (Socket.IO / EventEmitter) — JS/TS only.
345+
cbm_extract_channels(&ctx);
346+
339347
// K8s / Kustomize semantic pass (additional structured extraction for YAML-based infra files).
340348
if (ctx.language == CBM_LANG_KUSTOMIZE || ctx.language == CBM_LANG_K8S) {
341349
cbm_extract_k8s(&ctx);

internal/cbm/cbm.h

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ typedef struct {
110110
bool is_test;
111111
bool is_entry_point;
112112
const char *structural_profile; // AST structural profile (arena-allocated) or NULL
113-
const char *body_tokens; // space-separated raw identifier tokens from body (arena) or NULL
113+
const char *body_tokens; // space-separated raw identifier tokens from body (arena) or NULL
114114
} CBMDefinition;
115115

116116
/* Argument captured from a call expression */
@@ -192,6 +192,24 @@ typedef struct {
192192
const char *broker; // "pubsub", "cloud_tasks", "cloud_scheduler", "sqs", "kafka"
193193
} CBMInfraBinding;
194194

195+
/* Pub/sub channel participation. One record per emit() or on()/addListener()
196+
* call detected in source — the receiver (e.g. Socket.IO client, EventEmitter
197+
* instance) is intentionally NOT identified; matching is by channel_name
198+
* across files, which captures the common pattern of one logical bus per
199+
* service. Transport disambiguates Socket.IO vs EventEmitter vs future
200+
* detectors (Kafka, Cloud Pub/Sub, etc.). */
201+
typedef enum {
202+
CBM_CHANNEL_EMIT = 0, // recorded for `.emit(...)` calls
203+
CBM_CHANNEL_LISTEN = 1, // recorded for `.on(...)`, `.addListener(...)` and `.once(...)` calls
204+
} CBMChannelDirection;
205+
206+
typedef struct {
207+
const char *channel_name; // channel name (e.g. "user.created"): a string literal, or a same-file string constant resolved from an identifier argument
208+
const char *transport; // "socketio", "event_emitter", ...
209+
const char *enclosing_func_qn; // name of an enclosing function, or NULL when none is found; NOTE(review): the extractor stores the bare identifier text, not a module-qualified name
210+
CBMChannelDirection direction; // whether this call emits on or listens to the channel
211+
} CBMChannel;
212+
195213
// Rust: impl Trait for Struct
196214
typedef struct {
197215
const char *trait_name; // trait name (raw text)
@@ -286,6 +304,12 @@ typedef struct {
286304
int cap;
287305
} CBMImplTraitArray;
288306

307+
// Array of CBMChannel records, appended to via cbm_channels_push().
typedef struct {
308+
CBMChannel *items; // element storage
309+
int count; // number of records stored so far (next append index)
310+
int cap; // NOTE(review): presumably the allocated capacity maintained by GROW_ARRAY — confirm
311+
} CBMChannelArray;
312+
289313
// Full extraction result for one file.
290314
typedef struct {
291315
CBMArena arena; // owns all string memory
@@ -303,6 +327,7 @@ typedef struct {
303327
CBMResolvedCallArray resolved_calls; // LSP-resolved calls (high confidence)
304328
CBMStringRefArray string_refs; // URL/config string literals from AST
305329
CBMInfraBindingArray infra_bindings; // topic→URL pairs from IaC configs
330+
CBMChannelArray channels; // Socket.IO / EventEmitter pub/sub participation
306331

307332
const char *module_qn; // module qualified name
308333
const char **exports; // NULL-terminated (NULL if none)
@@ -421,6 +446,7 @@ void cbm_stringref_push(CBMStringRefArray *arr, CBMArena *a, CBMStringRef sr);
421446
void cbm_infrabinding_push(CBMInfraBindingArray *arr, CBMArena *a, CBMInfraBinding ib);
422447
void cbm_impltrait_push(CBMImplTraitArray *arr, CBMArena *a, CBMImplTrait it);
423448
void cbm_resolvedcall_push(CBMResolvedCallArray *arr, CBMArena *a, CBMResolvedCall rc);
449+
void cbm_channels_push(CBMChannelArray *arr, CBMArena *a, CBMChannel ch);
424450

425451
// --- Sub-extractor entry points ---
426452

@@ -432,6 +458,7 @@ void cbm_extract_semantic(CBMExtractCtx *ctx);
432458
void cbm_extract_type_refs(CBMExtractCtx *ctx);
433459
void cbm_extract_env_accesses(CBMExtractCtx *ctx);
434460
void cbm_extract_type_assigns(CBMExtractCtx *ctx);
461+
void cbm_extract_channels(CBMExtractCtx *ctx);
435462

436463
// Single-pass unified extraction (replaces the 7 calls above except defs+imports).
437464
void cbm_extract_unified(CBMExtractCtx *ctx);

internal/cbm/extract_channels.c

Lines changed: 284 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,284 @@
1+
/*
2+
* extract_channels.c — Pub/sub channel participation extractor.
3+
*
4+
* Detects Socket.IO and EventEmitter emit / on / addListener call patterns in
5+
* JS/TS/TSX source and records each participation as a CBMChannel record.
6+
* Transport is stored on the record ("socketio", "event_emitter") so later
7+
* detectors for Kafka, Cloud Pub/Sub, etc. can share the same schema without
8+
* changing the edge types.
9+
*
10+
* String-constant resolution: when the channel name argument is a plain
11+
* identifier, we perform a single-pass local scan of the module body to
12+
* resolve `const EVENT = "foo"` style bindings. Template literals and
13+
* config-driven names stay unresolved (acceptable — those require real
14+
* data-flow analysis).
15+
*/
16+
#include "cbm.h"
17+
#include "arena.h"
18+
#include "helpers.h"
19+
#include "foundation/constants.h"
20+
#include "tree_sitter/api.h"
21+
#include <stdint.h>
22+
#include <string.h>
23+
24+
/* Fixed capacities for the per-file channel scan. Both are hard limits:
 * bindings beyond CHAN_CONST_CAP are not recorded, and children that do not
 * fit on the walk stack are not visited. */
enum {
    CHAN_CONST_CAP = 256,  /* max string-constant bindings recorded per file */
    CHAN_STACK_CAP = 4096, /* capacity of the explicit tree-walk stack */
};

/* One `IDENT = "value"` binding. Both strings are borrowed from the
 * extraction arena; the table never frees them. */
typedef struct {
    const char *name;  /* identifier text */
    const char *value; /* string literal text with its surrounding quotes removed */
} chan_const_t;

/* Flat, unscoped table of bindings for one file. Entries [0, count) are
 * valid; lookups return the first entry whose name matches. */
typedef struct {
    chan_const_t items[CHAN_CONST_CAP];
    int count;
} chan_const_table_t;
39+
40+
/* ── String literal helpers ──────────────────────────────────────── */
41+
42+
/* Strip one pair of matching quote characters from the source text of a
 * string literal.
 *
 * Accepts text wrapped in "...", '...' or `...`; the opening and closing
 * characters must be the same. Escape sequences inside the literal are not
 * decoded. Returns whatever cbm_arena_strndup() yields for the characters
 * between the quotes, or NULL when `s` is NULL, shorter than CBM_QUOTE_PAIR,
 * or not wrapped in a matching pair. */
static const char *unquote_string(CBMArena *a, const char *s) {
    if (s == NULL) {
        return NULL;
    }

    const size_t n = strlen(s);
    if (n < CBM_QUOTE_PAIR) {
        return NULL;
    }

    const char open = s[0];
    if (open != '"' && open != '\'' && open != '`') {
        return NULL;
    }
    if (s[n - CBM_QUOTE_OFFSET] != open) {
        return NULL; /* closing character does not match the opening one */
    }

    return cbm_arena_strndup(a, s + CBM_QUOTE_OFFSET, n - CBM_QUOTE_PAIR);
}
58+
59+
/* Extract a literal channel name from an argument node. Returns NULL if the
60+
* argument is not a plain string literal (caller can then try identifier
61+
* resolution via the constant table). */
62+
static const char *literal_from_arg(CBMExtractCtx *ctx, TSNode arg) {
63+
const char *kind = ts_node_type(arg);
64+
if (strcmp(kind, "string") != 0 && strcmp(kind, "string_literal") != 0) {
65+
return NULL;
66+
}
67+
char *text = cbm_node_text(ctx->arena, arg, ctx->source);
68+
return unquote_string(ctx->arena, text);
69+
}
70+
71+
/* ── Constant resolution table ──────────────────────────────────── */
72+
73+
/* Walk the whole tree once and collect `const IDENT = "value"` bindings so
74+
* later passes can resolve bare-identifier channel arguments. Only scalar
75+
* string literals are tracked — template literals and expressions are left
76+
* unresolved. This is a flat lookup; scope boundaries are ignored (a single
77+
* const table per file is sufficient for the common Socket.IO pattern). */
78+
static void scan_string_consts(CBMExtractCtx *ctx, chan_const_table_t *tbl) {
79+
TSNode stack[CHAN_STACK_CAP];
80+
int top = 0;
81+
stack[top++] = ctx->root;
82+
83+
while (top > 0 && tbl->count < CHAN_CONST_CAP) {
84+
TSNode node = stack[--top];
85+
const char *kind = ts_node_type(node);
86+
87+
/* `variable_declarator` is the TS/JS form of `IDENT = value`.
88+
* The `name` field holds the identifier, `value` holds the RHS. */
89+
if (strcmp(kind, "variable_declarator") == 0) {
90+
TSNode name_node = ts_node_child_by_field_name(node, TS_FIELD("name"));
91+
TSNode value_node = ts_node_child_by_field_name(node, TS_FIELD("value"));
92+
if (!ts_node_is_null(name_node) && !ts_node_is_null(value_node)) {
93+
const char *nk = ts_node_type(name_node);
94+
const char *vk = ts_node_type(value_node);
95+
if (strcmp(nk, "identifier") == 0 &&
96+
(strcmp(vk, "string") == 0 || strcmp(vk, "string_literal") == 0)) {
97+
char *name_text = cbm_node_text(ctx->arena, name_node, ctx->source);
98+
char *value_text = cbm_node_text(ctx->arena, value_node, ctx->source);
99+
const char *unq = unquote_string(ctx->arena, value_text);
100+
if (name_text && unq) {
101+
tbl->items[tbl->count].name = name_text;
102+
tbl->items[tbl->count].value = unq;
103+
tbl->count++;
104+
}
105+
}
106+
}
107+
}
108+
109+
uint32_t count = ts_node_child_count(node);
110+
for (int i = (int)count - SKIP_ONE; i >= 0 && top < CHAN_STACK_CAP; i--) {
111+
stack[top++] = ts_node_child(node, (uint32_t)i);
112+
}
113+
}
114+
}
115+
116+
/* Resolve an identifier against the constant table. Returns NULL on miss. */
117+
static const char *resolve_identifier(const chan_const_table_t *tbl, const char *name) {
118+
if (!name) {
119+
return NULL;
120+
}
121+
for (int i = 0; i < tbl->count; i++) {
122+
if (tbl->items[i].name && strcmp(tbl->items[i].name, name) == 0) {
123+
return tbl->items[i].value;
124+
}
125+
}
126+
return NULL;
127+
}
128+
129+
/* ── Enclosing function detection ───────────────────────────────── */
130+
131+
/* Walk up the parent chain to find the nearest function-like ancestor and
132+
* build a best-effort qualified name for it. */
133+
static const char *enclosing_function_qn(CBMExtractCtx *ctx, TSNode node) {
134+
TSNode parent = ts_node_parent(node);
135+
while (!ts_node_is_null(parent)) {
136+
const char *pk = ts_node_type(parent);
137+
if (strcmp(pk, "function_declaration") == 0 || strcmp(pk, "method_definition") == 0 ||
138+
strcmp(pk, "arrow_function") == 0 || strcmp(pk, "function_expression") == 0 ||
139+
strcmp(pk, "function") == 0 || strcmp(pk, "method_signature") == 0) {
140+
TSNode name_node = ts_node_child_by_field_name(parent, TS_FIELD("name"));
141+
if (!ts_node_is_null(name_node)) {
142+
char *name = cbm_node_text(ctx->arena, name_node, ctx->source);
143+
if (name && name[0]) {
144+
return name;
145+
}
146+
}
147+
return NULL;
148+
}
149+
parent = ts_node_parent(parent);
150+
}
151+
return NULL;
152+
}
153+
154+
/* ── Emit / listen detection ────────────────────────────────────── */
155+
156+
/* True when `name` is exactly "emit". NULL is treated as no match. */
static bool is_emit_method(const char *name) {
    if (name == NULL) {
        return false;
    }
    return strcmp(name, "emit") == 0;
}
159+
160+
/* True when `name` is one of the listener-registration method names:
 * "on", "addListener" or "once". NULL is treated as no match. */
static bool is_listen_method(const char *name) {
    static const char *const listen_methods[] = {"on", "addListener", "once"};

    if (name == NULL) {
        return false;
    }
    for (size_t i = 0; i < sizeof listen_methods / sizeof listen_methods[0]; i++) {
        if (strcmp(name, listen_methods[i]) == 0) {
            return true;
        }
    }
    return false;
}
164+
165+
/* Match a transport receiver: plain identifier `socket`/`io`/`emitter` or a
166+
* member expression like `this.io` / `client.socket`. Returns "socketio" or
167+
* "event_emitter" based on a name heuristic, NULL if the receiver is unknown
168+
* (which means we skip — we don't want to mistake any .emit()/.on() call
169+
* for a channel). */
170+
static const char *classify_receiver(CBMExtractCtx *ctx, TSNode object_node) {
171+
char *text = cbm_node_text(ctx->arena, object_node, ctx->source);
172+
if (!text) {
173+
return NULL;
174+
}
175+
/* Strip leading `this.`, `self.`, or class-instance prefixes for the match. */
176+
const char *tail = text;
177+
const char *dot = strrchr(tail, '.');
178+
if (dot) {
179+
tail = dot + SKIP_ONE;
180+
}
181+
/* Common Socket.IO variable names. */
182+
if (strcmp(tail, "socket") == 0 || strcmp(tail, "io") == 0 || strcmp(tail, "ws") == 0 ||
183+
strcmp(tail, "client") == 0 || strcmp(tail, "server") == 0) {
184+
return "socketio";
185+
}
186+
/* Node.js EventEmitter convention. */
187+
if (strcmp(tail, "emitter") == 0 || strcmp(tail, "eventEmitter") == 0 ||
188+
strcmp(tail, "events") == 0 || strcmp(tail, "bus") == 0 || strcmp(tail, "eventBus") == 0 ||
189+
strcmp(tail, "pubsub") == 0) {
190+
return "event_emitter";
191+
}
192+
return NULL;
193+
}
194+
195+
/* Process a single call_expression node if it looks like a channel call. */
196+
static void process_channel_call(CBMExtractCtx *ctx, TSNode call,
197+
const chan_const_table_t *consts) {
198+
/* call_expression { function: member_expression { object, property }, arguments } */
199+
TSNode func = ts_node_child_by_field_name(call, TS_FIELD("function"));
200+
if (ts_node_is_null(func) || strcmp(ts_node_type(func), "member_expression") != 0) {
201+
return;
202+
}
203+
TSNode object = ts_node_child_by_field_name(func, TS_FIELD("object"));
204+
TSNode property = ts_node_child_by_field_name(func, TS_FIELD("property"));
205+
if (ts_node_is_null(object) || ts_node_is_null(property)) {
206+
return;
207+
}
208+
209+
char *method = cbm_node_text(ctx->arena, property, ctx->source);
210+
CBMChannelDirection direction;
211+
if (is_emit_method(method)) {
212+
direction = CBM_CHANNEL_EMIT;
213+
} else if (is_listen_method(method)) {
214+
direction = CBM_CHANNEL_LISTEN;
215+
} else {
216+
return;
217+
}
218+
219+
const char *transport = classify_receiver(ctx, object);
220+
if (!transport) {
221+
return;
222+
}
223+
224+
/* First positional argument is the channel name. */
225+
TSNode args = ts_node_child_by_field_name(call, TS_FIELD("arguments"));
226+
if (ts_node_is_null(args)) {
227+
return;
228+
}
229+
uint32_t arg_count = ts_node_named_child_count(args);
230+
if (arg_count == 0) {
231+
return;
232+
}
233+
TSNode first = ts_node_named_child(args, 0);
234+
235+
const char *channel_name = literal_from_arg(ctx, first);
236+
if (!channel_name) {
237+
/* Try identifier resolution via the constant table. */
238+
const char *kind = ts_node_type(first);
239+
if (strcmp(kind, "identifier") == 0) {
240+
char *ident = cbm_node_text(ctx->arena, first, ctx->source);
241+
channel_name = resolve_identifier(consts, ident);
242+
}
243+
}
244+
if (!channel_name) {
245+
return; /* template literal, member access, expression — skip */
246+
}
247+
248+
CBMChannel ch = {
249+
.channel_name = channel_name,
250+
.transport = transport,
251+
.enclosing_func_qn = enclosing_function_qn(ctx, call),
252+
.direction = direction,
253+
};
254+
cbm_channels_push(&ctx->result->channels, ctx->arena, ch);
255+
}
256+
257+
/* ── Entry point ────────────────────────────────────────────────── */
258+
259+
/* Entry point of the channel extractor.
 *
 * Does nothing unless ctx->language is JavaScript, TypeScript or TSX.
 * Otherwise it runs two passes over the tree rooted at ctx->root:
 *   1. scan_string_consts() builds the per-file string-constant table;
 *   2. an explicit-stack pre-order walk hands every `call_expression` to
 *      process_channel_call(), which appends matching records to the result.
 * The walk stack holds CHAN_STACK_CAP nodes; children that do not fit are
 * silently left unvisited. */
void cbm_extract_channels(CBMExtractCtx *ctx) {
    const bool js_family = ctx->language == CBM_LANG_JAVASCRIPT ||
                           ctx->language == CBM_LANG_TYPESCRIPT ||
                           ctx->language == CBM_LANG_TSX;
    if (!js_family) {
        return;
    }

    chan_const_table_t consts = {0};
    scan_string_consts(ctx, &consts);

    TSNode pending[CHAN_STACK_CAP];
    int depth = 0;
    pending[depth++] = ctx->root;

    while (depth > 0) {
        TSNode cur = pending[--depth];
        if (strcmp(ts_node_type(cur), "call_expression") == 0) {
            process_channel_call(ctx, cur, &consts);
        }

        /* Push children last-to-first so the first child is popped next. */
        uint32_t n_children = ts_node_child_count(cur);
        int i = (int)n_children - SKIP_ONE;
        while (i >= 0 && depth < CHAN_STACK_CAP) {
            pending[depth++] = ts_node_child(cur, (uint32_t)i);
            i--;
        }
    }
}

0 commit comments

Comments
 (0)