|
| 1 | +/* |
| 2 | + * extract_channels.c — Pub/sub channel participation extractor. |
| 3 | + * |
| 4 | + * Detects Socket.IO and EventEmitter emit / on / addListener call patterns in |
| 5 | + * JS/TS/TSX source and records each participation as a CBMChannel record. |
| 6 | + * Transport is stored on the record ("socketio", "event_emitter") so later |
| 7 | + * detectors for Kafka, Cloud Pub/Sub, etc. can share the same schema without |
| 8 | + * changing the edge types. |
| 9 | + * |
| 10 | + * String-constant resolution: when the channel name argument is a plain |
| 11 | + * identifier, we perform a single-pass local scan of the module body to |
| 12 | + * resolve `const EVENT = "foo"` style bindings. Template literals and |
| 13 | + * config-driven names stay unresolved (acceptable — those require real |
| 14 | + * data-flow analysis). |
| 15 | + */ |
| 16 | +#include "cbm.h" |
| 17 | +#include "arena.h" |
| 18 | +#include "helpers.h" |
| 19 | +#include "foundation/constants.h" |
| 20 | +#include "tree_sitter/api.h" |
| 21 | +#include <stdint.h> |
| 22 | +#include <string.h> |
| 23 | + |
/* Fixed capacities used by the channel extractor. */
enum {
    /* Upper bound on identifier -> string bindings tracked per file. */
    CHAN_CONST_CAP = 256,
    /* Capacity of the explicit tree-walk stack. */
    CHAN_STACK_CAP = 4096,
    /* Maximum identifier length tracked.
     * NOTE(review): not referenced by the code visible in this file. */
    CHAN_IDENT_MAX = 128,
};
| 29 | + |
/* One resolved binding: an identifier paired with its unquoted string value.
 * Both pointers are borrowed (they point into the arena); this struct owns
 * nothing. */
typedef struct {
    const char *name;  /* identifier text */
    const char *value; /* string contents with the quotes removed */
} chan_const_t;
| 34 | + |
| 35 | +typedef struct { |
| 36 | + chan_const_t items[CHAN_CONST_CAP]; |
| 37 | + int count; |
| 38 | +} chan_const_table_t; |
| 39 | + |
| 40 | +/* ── String literal helpers ──────────────────────────────────────── */ |
| 41 | + |
| 42 | +static const char *unquote_string(CBMArena *a, const char *s) { |
| 43 | + if (!s) { |
| 44 | + return NULL; |
| 45 | + } |
| 46 | + size_t len = strlen(s); |
| 47 | + if (len < CBM_QUOTE_PAIR) { |
| 48 | + return NULL; |
| 49 | + } |
| 50 | + char first = s[0]; |
| 51 | + char last = s[len - CBM_QUOTE_OFFSET]; |
| 52 | + if ((first == '"' && last == '"') || (first == '\'' && last == '\'') || |
| 53 | + (first == '`' && last == '`')) { |
| 54 | + return cbm_arena_strndup(a, s + CBM_QUOTE_OFFSET, len - CBM_QUOTE_PAIR); |
| 55 | + } |
| 56 | + return NULL; |
| 57 | +} |
| 58 | + |
| 59 | +/* Extract a literal channel name from an argument node. Returns NULL if the |
| 60 | + * argument is not a plain string literal (caller can then try identifier |
| 61 | + * resolution via the constant table). */ |
| 62 | +static const char *literal_from_arg(CBMExtractCtx *ctx, TSNode arg) { |
| 63 | + const char *kind = ts_node_type(arg); |
| 64 | + if (strcmp(kind, "string") != 0 && strcmp(kind, "string_literal") != 0) { |
| 65 | + return NULL; |
| 66 | + } |
| 67 | + char *text = cbm_node_text(ctx->arena, arg, ctx->source); |
| 68 | + return unquote_string(ctx->arena, text); |
| 69 | +} |
| 70 | + |
| 71 | +/* ── Constant resolution table ──────────────────────────────────── */ |
| 72 | + |
| 73 | +/* Walk the whole tree once and collect `const IDENT = "value"` bindings so |
| 74 | + * later passes can resolve bare-identifier channel arguments. Only scalar |
| 75 | + * string literals are tracked — template literals and expressions are left |
| 76 | + * unresolved. This is a flat lookup; scope boundaries are ignored (a single |
| 77 | + * const table per file is sufficient for the common Socket.IO pattern). */ |
| 78 | +static void scan_string_consts(CBMExtractCtx *ctx, chan_const_table_t *tbl) { |
| 79 | + TSNode stack[CHAN_STACK_CAP]; |
| 80 | + int top = 0; |
| 81 | + stack[top++] = ctx->root; |
| 82 | + |
| 83 | + while (top > 0 && tbl->count < CHAN_CONST_CAP) { |
| 84 | + TSNode node = stack[--top]; |
| 85 | + const char *kind = ts_node_type(node); |
| 86 | + |
| 87 | + /* `variable_declarator` is the TS/JS form of `IDENT = value`. |
| 88 | + * The `name` field holds the identifier, `value` holds the RHS. */ |
| 89 | + if (strcmp(kind, "variable_declarator") == 0) { |
| 90 | + TSNode name_node = ts_node_child_by_field_name(node, TS_FIELD("name")); |
| 91 | + TSNode value_node = ts_node_child_by_field_name(node, TS_FIELD("value")); |
| 92 | + if (!ts_node_is_null(name_node) && !ts_node_is_null(value_node)) { |
| 93 | + const char *nk = ts_node_type(name_node); |
| 94 | + const char *vk = ts_node_type(value_node); |
| 95 | + if (strcmp(nk, "identifier") == 0 && |
| 96 | + (strcmp(vk, "string") == 0 || strcmp(vk, "string_literal") == 0)) { |
| 97 | + char *name_text = cbm_node_text(ctx->arena, name_node, ctx->source); |
| 98 | + char *value_text = cbm_node_text(ctx->arena, value_node, ctx->source); |
| 99 | + const char *unq = unquote_string(ctx->arena, value_text); |
| 100 | + if (name_text && unq) { |
| 101 | + tbl->items[tbl->count].name = name_text; |
| 102 | + tbl->items[tbl->count].value = unq; |
| 103 | + tbl->count++; |
| 104 | + } |
| 105 | + } |
| 106 | + } |
| 107 | + } |
| 108 | + |
| 109 | + uint32_t count = ts_node_child_count(node); |
| 110 | + for (int i = (int)count - SKIP_ONE; i >= 0 && top < CHAN_STACK_CAP; i--) { |
| 111 | + stack[top++] = ts_node_child(node, (uint32_t)i); |
| 112 | + } |
| 113 | + } |
| 114 | +} |
| 115 | + |
| 116 | +/* Resolve an identifier against the constant table. Returns NULL on miss. */ |
| 117 | +static const char *resolve_identifier(const chan_const_table_t *tbl, const char *name) { |
| 118 | + if (!name) { |
| 119 | + return NULL; |
| 120 | + } |
| 121 | + for (int i = 0; i < tbl->count; i++) { |
| 122 | + if (tbl->items[i].name && strcmp(tbl->items[i].name, name) == 0) { |
| 123 | + return tbl->items[i].value; |
| 124 | + } |
| 125 | + } |
| 126 | + return NULL; |
| 127 | +} |
| 128 | + |
| 129 | +/* ── Enclosing function detection ───────────────────────────────── */ |
| 130 | + |
| 131 | +/* Walk up the parent chain to find the nearest function-like ancestor and |
| 132 | + * build a best-effort qualified name for it. */ |
| 133 | +static const char *enclosing_function_qn(CBMExtractCtx *ctx, TSNode node) { |
| 134 | + TSNode parent = ts_node_parent(node); |
| 135 | + while (!ts_node_is_null(parent)) { |
| 136 | + const char *pk = ts_node_type(parent); |
| 137 | + if (strcmp(pk, "function_declaration") == 0 || strcmp(pk, "method_definition") == 0 || |
| 138 | + strcmp(pk, "arrow_function") == 0 || strcmp(pk, "function_expression") == 0 || |
| 139 | + strcmp(pk, "function") == 0 || strcmp(pk, "method_signature") == 0) { |
| 140 | + TSNode name_node = ts_node_child_by_field_name(parent, TS_FIELD("name")); |
| 141 | + if (!ts_node_is_null(name_node)) { |
| 142 | + char *name = cbm_node_text(ctx->arena, name_node, ctx->source); |
| 143 | + if (name && name[0]) { |
| 144 | + return name; |
| 145 | + } |
| 146 | + } |
| 147 | + return NULL; |
| 148 | + } |
| 149 | + parent = ts_node_parent(parent); |
| 150 | + } |
| 151 | + return NULL; |
| 152 | +} |
| 153 | + |
| 154 | +/* ── Emit / listen detection ────────────────────────────────────── */ |
| 155 | + |
/* True when `name` is exactly "emit"; false for NULL or any other string. */
static bool is_emit_method(const char *name) {
    if (name == NULL) {
        return false;
    }
    return strcmp(name, "emit") == 0;
}
| 159 | + |
/* True when `name` is one of the listener-registration method names
 * ("on", "addListener", "once"); false for NULL or any other string. */
static bool is_listen_method(const char *name) {
    static const char *const listen_names[] = {"on", "addListener", "once"};

    if (name == NULL) {
        return false;
    }
    for (size_t i = 0; i < sizeof listen_names / sizeof listen_names[0]; i++) {
        if (strcmp(name, listen_names[i]) == 0) {
            return true;
        }
    }
    return false;
}
| 164 | + |
| 165 | +/* Match a transport receiver: plain identifier `socket`/`io`/`emitter` or a |
| 166 | + * member expression like `this.io` / `client.socket`. Returns "socketio" or |
| 167 | + * "event_emitter" based on a name heuristic, NULL if the receiver is unknown |
| 168 | + * (which means we skip — we don't want to mistake any .emit()/.on() call |
| 169 | + * for a channel). */ |
| 170 | +static const char *classify_receiver(CBMExtractCtx *ctx, TSNode object_node) { |
| 171 | + char *text = cbm_node_text(ctx->arena, object_node, ctx->source); |
| 172 | + if (!text) { |
| 173 | + return NULL; |
| 174 | + } |
| 175 | + /* Strip leading `this.`, `self.`, or class-instance prefixes for the match. */ |
| 176 | + const char *tail = text; |
| 177 | + const char *dot = strrchr(tail, '.'); |
| 178 | + if (dot) { |
| 179 | + tail = dot + SKIP_ONE; |
| 180 | + } |
| 181 | + /* Common Socket.IO variable names. */ |
| 182 | + if (strcmp(tail, "socket") == 0 || strcmp(tail, "io") == 0 || strcmp(tail, "ws") == 0 || |
| 183 | + strcmp(tail, "client") == 0 || strcmp(tail, "server") == 0) { |
| 184 | + return "socketio"; |
| 185 | + } |
| 186 | + /* Node.js EventEmitter convention. */ |
| 187 | + if (strcmp(tail, "emitter") == 0 || strcmp(tail, "eventEmitter") == 0 || |
| 188 | + strcmp(tail, "events") == 0 || strcmp(tail, "bus") == 0 || strcmp(tail, "eventBus") == 0 || |
| 189 | + strcmp(tail, "pubsub") == 0) { |
| 190 | + return "event_emitter"; |
| 191 | + } |
| 192 | + return NULL; |
| 193 | +} |
| 194 | + |
| 195 | +/* Process a single call_expression node if it looks like a channel call. */ |
| 196 | +static void process_channel_call(CBMExtractCtx *ctx, TSNode call, |
| 197 | + const chan_const_table_t *consts) { |
| 198 | + /* call_expression { function: member_expression { object, property }, arguments } */ |
| 199 | + TSNode func = ts_node_child_by_field_name(call, TS_FIELD("function")); |
| 200 | + if (ts_node_is_null(func) || strcmp(ts_node_type(func), "member_expression") != 0) { |
| 201 | + return; |
| 202 | + } |
| 203 | + TSNode object = ts_node_child_by_field_name(func, TS_FIELD("object")); |
| 204 | + TSNode property = ts_node_child_by_field_name(func, TS_FIELD("property")); |
| 205 | + if (ts_node_is_null(object) || ts_node_is_null(property)) { |
| 206 | + return; |
| 207 | + } |
| 208 | + |
| 209 | + char *method = cbm_node_text(ctx->arena, property, ctx->source); |
| 210 | + CBMChannelDirection direction; |
| 211 | + if (is_emit_method(method)) { |
| 212 | + direction = CBM_CHANNEL_EMIT; |
| 213 | + } else if (is_listen_method(method)) { |
| 214 | + direction = CBM_CHANNEL_LISTEN; |
| 215 | + } else { |
| 216 | + return; |
| 217 | + } |
| 218 | + |
| 219 | + const char *transport = classify_receiver(ctx, object); |
| 220 | + if (!transport) { |
| 221 | + return; |
| 222 | + } |
| 223 | + |
| 224 | + /* First positional argument is the channel name. */ |
| 225 | + TSNode args = ts_node_child_by_field_name(call, TS_FIELD("arguments")); |
| 226 | + if (ts_node_is_null(args)) { |
| 227 | + return; |
| 228 | + } |
| 229 | + uint32_t arg_count = ts_node_named_child_count(args); |
| 230 | + if (arg_count == 0) { |
| 231 | + return; |
| 232 | + } |
| 233 | + TSNode first = ts_node_named_child(args, 0); |
| 234 | + |
| 235 | + const char *channel_name = literal_from_arg(ctx, first); |
| 236 | + if (!channel_name) { |
| 237 | + /* Try identifier resolution via the constant table. */ |
| 238 | + const char *kind = ts_node_type(first); |
| 239 | + if (strcmp(kind, "identifier") == 0) { |
| 240 | + char *ident = cbm_node_text(ctx->arena, first, ctx->source); |
| 241 | + channel_name = resolve_identifier(consts, ident); |
| 242 | + } |
| 243 | + } |
| 244 | + if (!channel_name) { |
| 245 | + return; /* template literal, member access, expression — skip */ |
| 246 | + } |
| 247 | + |
| 248 | + CBMChannel ch = { |
| 249 | + .channel_name = channel_name, |
| 250 | + .transport = transport, |
| 251 | + .enclosing_func_qn = enclosing_function_qn(ctx, call), |
| 252 | + .direction = direction, |
| 253 | + }; |
| 254 | + cbm_channels_push(&ctx->result->channels, ctx->arena, ch); |
| 255 | +} |
| 256 | + |
| 257 | +/* ── Entry point ────────────────────────────────────────────────── */ |
| 258 | + |
| 259 | +void cbm_extract_channels(CBMExtractCtx *ctx) { |
| 260 | + /* Only JS/TS variants — Socket.IO and EventEmitter are Node.js ecosystem. */ |
| 261 | + if (ctx->language != CBM_LANG_JAVASCRIPT && ctx->language != CBM_LANG_TYPESCRIPT && |
| 262 | + ctx->language != CBM_LANG_TSX) { |
| 263 | + return; |
| 264 | + } |
| 265 | + |
| 266 | + chan_const_table_t consts = {0}; |
| 267 | + scan_string_consts(ctx, &consts); |
| 268 | + |
| 269 | + /* Second pass: walk the tree looking for call_expression nodes. */ |
| 270 | + TSNode stack[CHAN_STACK_CAP]; |
| 271 | + int top = 0; |
| 272 | + stack[top++] = ctx->root; |
| 273 | + |
| 274 | + while (top > 0) { |
| 275 | + TSNode node = stack[--top]; |
| 276 | + if (strcmp(ts_node_type(node), "call_expression") == 0) { |
| 277 | + process_channel_call(ctx, node, &consts); |
| 278 | + } |
| 279 | + uint32_t count = ts_node_child_count(node); |
| 280 | + for (int i = (int)count - SKIP_ONE; i >= 0 && top < CHAN_STACK_CAP; i--) { |
| 281 | + stack[top++] = ts_node_child(node, (uint32_t)i); |
| 282 | + } |
| 283 | + } |
| 284 | +} |
0 commit comments