Skip to content

Commit f685192

Browse files
committed
Improved mdx support
1 parent 46f18d1 commit f685192

4 files changed

Lines changed: 506 additions & 5 deletions

File tree

src/dbmem-parser.c

Lines changed: 307 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include "dbmem-utils.h"
2222
#include "md4c.h"
2323

24+
#include <ctype.h>
2425
#include <stdio.h>
2526
#include <string.h>
2627

@@ -66,10 +67,19 @@ typedef struct {
6667
size_t wp; // Write position in buffer
6768
const char *line_end; // End of current line
6869
bool skip_html; // Whether to skip HTML tags
70+
bool mdx_mode; // Whether to strip MDX-specific syntax
6971
int in_html_tag; // Currently inside multi-line HTML tag
7072
int in_fenced_code; // Currently inside fenced code block
7173
char fence_char; // Fence character (` or ~)
7274
int fence_width; // Number of fence characters
75+
int in_mdx_expr; // Currently inside a multi-line MDX expression
76+
int mdx_expr_depth; // Nested brace depth for MDX expressions
77+
char mdx_expr_quote; // Active quote inside MDX expression
78+
int mdx_expr_block_comment;
79+
int in_mdx_esm; // Currently skipping a multi-line ESM statement
80+
int mdx_esm_depth; // Nested delimiter depth for ESM statements
81+
char mdx_esm_quote; // Active quote inside ESM statement
82+
int mdx_esm_block_comment;
7383
} strip_ctx_t;
7484

7585
// MARK: - Helpers -
@@ -162,6 +172,255 @@ static const char *skip_until (const char *p, const char *end, char c) {
162172
return (p < end) ? p + 1 : p;
163173
}
164174

175+
// Advance past a run of spaces and horizontal tabs.
// Returns the first position in [p, end) that holds any other byte,
// or `end` when the rest of the range is blank.
static const char *skip_spaces_tabs (const char *p, const char *end) {
    for (; p < end; ++p) {
        if (*p != ' ' && *p != '\t') break;
    }
    return p;
}
179+
180+
// Report whether `c` may appear inside a JavaScript-style identifier:
// an ASCII letter or digit, '_' or '$'. Returns 1 if so, 0 otherwise.
static int is_ident_continue (char c) {
    if (c == '_' || c == '$') return 1;
    // Cast first: handing a negative char value to <ctype.h> is undefined behavior.
    return isalnum((unsigned char)c) != 0;
}
183+
184+
// Test whether the bytes at `p` spell the keyword `kw` as a complete word.
// The keyword must fit inside [p, end) and must not be followed by an
// identifier character, so "from" matches "from 'x'" but not "fromage".
// Only the right-hand boundary is checked; the caller must make sure `p`
// itself sits at the start of a token.
// Returns 1 on a match, 0 otherwise.
static int starts_keyword (const char *p, const char *end, const char *kw) {
    size_t kw_len = strlen(kw);
    size_t avail = (size_t)(end - p);

    if (avail < kw_len || strncmp(p, kw, kw_len) != 0) return 0;
    if (avail == kw_len) return 1;   // keyword runs right up to `end`
    return !is_ident_continue(p[kw_len]);
}
190+
191+
// Skip one identifier starting at `p`.
// An identifier is a run of is_ident_continue() characters that does not
// begin with a digit ("abc1", "_x" and "$y" qualify; "1abc" does not).
// Returns the position just past the identifier, or NULL when `p` is at
// `end` or does not start an identifier.
static const char *skip_identifier (const char *p, const char *end) {
    if (p >= end) return NULL;
    // A leading digit starts a numeric literal, not an identifier.
    if (isdigit((unsigned char)*p) || !is_ident_continue(*p)) return NULL;

    do {
        ++p;
    } while (p < end && is_ident_continue(*p));
    return p;
}
196+
197+
// Locate the first non-space byte of a line that may hold a top-level construct.
// Up to three leading spaces are tolerated. A line indented by four or more
// spaces, or whose first byte after the spaces is a tab, is rejected.
// Returns the position after the leading spaces (possibly `end`), or NULL
// when the line is rejected.
static const char *mdx_top_level_start (const char *p, const char *end) {
    const char *first = p;

    while (first < end && *first == ' ') ++first;
    if (first - p > 3) return NULL;
    if (first < end && *first == '\t') return NULL;
    return first;
}
203+
204+
// Scan [p, end) for a standalone `from` keyword, as found in an import clause.
// String literals ('...', "...", `...`) and /* ... */ comments are skipped, and a
// `//` comment ends the scan, so a "from" inside any of those does not count.
// Candidates are compared as whole identifier tokens: `from` matches, while
// `afrom`, `$from` and `fromage` do not.
// Returns 1 if the keyword is present, 0 otherwise.
static int mdx_has_from_keyword (const char *p, const char *end) {
    char quote = 0;              // active string delimiter, 0 when outside a string
    int in_block_comment = 0;

    while (p < end) {
        char c = *p;

        if (quote) {
            // A backslash escapes the next byte, so an escaped delimiter does not close the string.
            if (c == '\\' && p + 1 < end) {
                p += 2;
                continue;
            }
            if (c == quote) quote = 0;
            ++p;
            continue;
        }

        if (in_block_comment) {
            if (c == '*' && p + 1 < end && p[1] == '/') {
                in_block_comment = 0;
                p += 2;
                continue;
            }
            ++p;
            continue;
        }

        if (c == '/' && p + 1 < end) {
            if (p[1] == '/') break;   // line comment: nothing after it can be code
            if (p[1] == '*') {
                in_block_comment = 1;
                p += 2;
                continue;
            }
        }

        if (c == '\'' || c == '"' || c == '`') {
            quote = c;
            ++p;
            continue;
        }

        // Consume a whole identifier token so that "from" embedded in a longer
        // name is never matched. Same character class as is_ident_continue():
        // ASCII alphanumerics, '_' and '$'.
        if (isalnum((unsigned char)c) || c == '_' || c == '$') {
            const char *word = p;
            do {
                ++p;
            } while (p < end && (isalnum((unsigned char)*p) || *p == '_' || *p == '$'));
            if (p - word == 4 && memcmp(word, "from", 4) == 0) return 1;
            continue;
        }

        ++p;
    }

    return 0;
}
252+
253+
// Decide whether the remainder of an `import` clause marks the line as ESM.
// The clause is accepted when a `from` keyword appears on the line, or when it
// opens a `{ ... }` specifier list that is not closed on this line (the list,
// and its `from`, then continue on the following lines).
static int mdx_import_clause_is_esm (const char *p, const char *end) {
    if (mdx_has_from_keyword(p, end)) return 1;

    const char *open = memchr(p, '{', (size_t)(end - p));
    return open && !memchr(open, '}', (size_t)(end - open));
}

// Decide whether the line [p, end) begins a top-level MDX ESM statement.
//
// The line must pass mdx_top_level_start() and then start with `import` or
// `export` followed by a space or tab (so `import(...)` and `import.meta`
// are not matched).
//
// import forms recognised:
//   import "mod" / import 'mod'
//   import { ... } from / import * ... from      (or an unclosed `{` list)
//   import Name from / import Name, ... from
//   import type ...                              (any of the above after `type`)
//   import type from "mod" / import type, ...    (`type` used as the binding name)
//
// export forms recognised: `export {`, `export *`, or `export` followed by one
// of const, let, var, function, class, default, async, type, interface, enum.
//
// Returns 1 when the line starts an ESM statement, 0 otherwise.
static int mdx_line_starts_esm (const char *p, const char *end) {
    p = mdx_top_level_start(p, end);
    if (!p) return 0;

    if (starts_keyword(p, end, "import")) {
        p += strlen("import");
        if (p >= end || (*p != ' ' && *p != '\t')) return 0;
        p = skip_spaces_tabs(p, end);
        if (p >= end) return 0;
        if (*p == '\'' || *p == '"') return 1;   // side-effect import
        if (*p == '{' || *p == '*') return mdx_import_clause_is_esm(p, end);

        const char *ident_end = skip_identifier(p, end);
        if (!ident_end) return 0;

        if ((size_t)(ident_end - p) == 4 && strncmp(p, "type", 4) == 0) {
            const char *next = skip_spaces_tabs(ident_end, end);
            if (next < end && (*next == '{' || *next == '*')) return mdx_import_clause_is_esm(next, end);

            // `type` is a modifier only when another binding name follows it.
            // In `import type from "mod"` and `import type, { x } from "mod"`
            // the word `type` is itself the default binding.
            const char *next_end = skip_identifier(next, end);
            int type_is_binding = (next_end == NULL);
            if (!type_is_binding && starts_keyword(next, end, "from")) {
                const char *spec = skip_spaces_tabs(next_end, end);
                type_is_binding = (spec < end && (*spec == '\'' || *spec == '"'));
            }
            if (!type_is_binding) ident_end = next_end;
        }

        p = skip_spaces_tabs(ident_end, end);
        if (starts_keyword(p, end, "from")) return 1;
        if (p < end && *p == ',') return mdx_import_clause_is_esm(p + 1, end);
        return 0;
    }

    if (starts_keyword(p, end, "export")) {
        static const char *const export_heads[] = {
            "const", "let", "var", "function", "class",
            "default", "async", "type", "interface", "enum"
        };

        p += strlen("export");
        if (p >= end || (*p != ' ' && *p != '\t')) return 0;
        p = skip_spaces_tabs(p, end);
        if (p >= end) return 0;
        if (*p == '{' || *p == '*') return 1;

        for (size_t i = 0; i < sizeof export_heads / sizeof export_heads[0]; i++) {
            if (starts_keyword(p, end, export_heads[i])) return 1;
        }
        return 0;
    }

    return 0;
}
301+
302+
// Feed one line [p, end) of an ESM statement into the scanner state kept in
// `ctx` (mdx_esm_quote, mdx_esm_block_comment, mdx_esm_depth).
//
// Tracks string literals, /* ... */ comments and the nesting depth of
// (), [] and {}; a `//` comment ends the scan of the line.
//
// Returns 1 when the statement is complete at the end of this line (no open
// string, no open block comment, all delimiters balanced), 0 when it
// continues on the next line.
static int mdx_update_esm_state (strip_ctx_t *ctx, const char *p, const char *end) {
    const char *line_start = p;

    while (p < end) {
        char c = *p;

        if (ctx->mdx_esm_quote) {
            // A backslash escapes the next byte, so an escaped delimiter does not close the string.
            if (c == '\\' && p + 1 < end) {
                p += 2;
                continue;
            }
            if (c == ctx->mdx_esm_quote) ctx->mdx_esm_quote = 0;
            ++p;
            continue;
        }

        if (ctx->mdx_esm_block_comment) {
            if (c == '*' && p + 1 < end && p[1] == '/') {
                ctx->mdx_esm_block_comment = 0;
                p += 2;
                continue;
            }
            ++p;
            continue;
        }

        if (c == '/' && p + 1 < end) {
            if (p[1] == '/') break;   // line comment: ignore the rest of the line
            if (p[1] == '*') {
                ctx->mdx_esm_block_comment = 1;
                p += 2;
                continue;
            }
        }

        if (c == '\'' || c == '"' || c == '`') {
            ctx->mdx_esm_quote = c;
            ++p;
            continue;
        }

        if (c == '(' || c == '[' || c == '{') {
            ctx->mdx_esm_depth++;
        } else if (c == ')' || c == ']' || c == '}') {
            // Never go negative on a stray closer.
            if (ctx->mdx_esm_depth > 0) ctx->mdx_esm_depth--;
        }

        ++p;
    }

    // Only template literals (`...`) may span lines. A ' or " still open here is
    // either a backslash line continuation or not a string at all (for example an
    // apostrophe in JSX text). Dropping it stops the statement from skipping
    // every line that follows.
    if ((ctx->mdx_esm_quote == '\'' || ctx->mdx_esm_quote == '"') &&
        !(end > line_start && end[-1] == '\\')) {
        ctx->mdx_esm_quote = 0;
    }

    return !ctx->mdx_esm_quote && !ctx->mdx_esm_block_comment && ctx->mdx_esm_depth == 0;
}
357+
358+
// Skip an MDX `{ ... }` expression, or continue skipping one left open by an
// earlier call.
//
// When ctx->in_mdx_expr is 0, `p` must point at the opening '{'; the expression
// state in `ctx` is initialised and the brace consumed. When it is already 1,
// scanning resumes at `p` with the saved depth, quote and comment state.
//
// String literals and /* ... */ or // comments are skipped so braces inside
// them do not affect the nesting depth.
//
// Returns the position just past the closing '}' (ctx->in_mdx_expr reset to 0),
// or `end` when the expression is still open (ctx->in_mdx_expr stays 1).
// Every call made with p < end returns a position greater than `p`, so a caller
// that loops while the expression is open always makes progress.
static const char *mdx_skip_expression (strip_ctx_t *ctx, const char *p, const char *end) {
    if (!ctx->in_mdx_expr) {
        ctx->in_mdx_expr = 1;
        ctx->mdx_expr_depth = 1;
        ctx->mdx_expr_quote = 0;
        ctx->mdx_expr_block_comment = 0;
        ++p;   // step over the opening '{'
    }

    while (p < end) {
        char c = *p;

        if (ctx->mdx_expr_quote) {
            // A backslash escapes the next byte, so an escaped delimiter does not close the string.
            if (c == '\\' && p + 1 < end) {
                p += 2;
                continue;
            }
            if (c == ctx->mdx_expr_quote) ctx->mdx_expr_quote = 0;
            ++p;
            continue;
        }

        if (ctx->mdx_expr_block_comment) {
            if (c == '*' && p + 1 < end && p[1] == '/') {
                ctx->mdx_expr_block_comment = 0;
                p += 2;
                continue;
            }
            ++p;
            continue;
        }

        if (c == '/' && p + 1 < end) {
            if (p[1] == '/') {
                // Line comment: consume it up to the newline (or `end`).
                // Stopping on the "//" itself would hand the caller the same
                // position again while the expression is still open, and the
                // caller would never advance.
                p += 2;
                while (p < end && *p != '\n') ++p;
                continue;
            }
            if (p[1] == '*') {
                ctx->mdx_expr_block_comment = 1;
                p += 2;
                continue;
            }
        }

        if (c == '\'' || c == '"' || c == '`') {
            ctx->mdx_expr_quote = c;
            ++p;
            continue;
        }

        if (c == '{') {
            ctx->mdx_expr_depth++;
        } else if (c == '}') {
            ctx->mdx_expr_depth--;
            ++p;
            if (ctx->mdx_expr_depth <= 0) {
                // Outermost brace closed: the expression is finished.
                ctx->in_mdx_expr = 0;
                ctx->mdx_expr_depth = 0;
                return p;
            }
            continue;
        }

        ++p;
    }

    return p;
}
423+
165424
// Check if line starts a fenced code block. Returns fence width (0 if not a fence).
166425
static int check_fence_start (const char *p, const char *end, char *out_char) {
167426
p = skip_leading_spaces(p, end, 3);
@@ -325,6 +584,25 @@ static void process_inline (strip_ctx_t *ctx, const char *start, const char *end
325584
const char *p = start;
326585

327586
while (p < end) {
587+
// Continue skipping a multi-line MDX expression.
588+
if (ctx->mdx_mode && ctx->in_mdx_expr) {
589+
p = mdx_skip_expression(ctx, p, end);
590+
continue;
591+
}
592+
593+
// Escaped opening braces are literal text in MDX.
594+
if (ctx->mdx_mode && *p == '\\' && p + 1 < end && p[1] == '{') {
595+
ctx->buf[ctx->wp++] = '{';
596+
p += 2;
597+
continue;
598+
}
599+
600+
// MDX expression: remove the JavaScript expression but keep nearby text.
601+
if (ctx->mdx_mode && *p == '{') {
602+
p = mdx_skip_expression(ctx, p, end);
603+
continue;
604+
}
605+
328606
// HTML tags
329607
if (ctx->skip_html && *p == '<') {
330608
const char *gt = p + 1;
@@ -387,18 +665,27 @@ static void process_inline (strip_ctx_t *ctx, const char *start, const char *end
387665
}
388666

389667
// Main markdown stripping function
390-
static char *strip_markdown (const char *src, size_t len, size_t *out_len, bool skip_html) {
668+
static char *strip_markdown (const char *src, size_t len, size_t *out_len, bool skip_html, bool mdx_mode) {
391669
char *buf = (char *)dbmemory_alloc(len + 1);
392670
if (!buf) return NULL;
393671

394672
strip_ctx_t ctx = {
395673
.buf = buf,
396674
.wp = 0,
397675
.skip_html = skip_html,
676+
.mdx_mode = mdx_mode,
398677
.in_html_tag = 0,
399678
.in_fenced_code = 0,
400679
.fence_char = 0,
401-
.fence_width = 0
680+
.fence_width = 0,
681+
.in_mdx_expr = 0,
682+
.mdx_expr_depth = 0,
683+
.mdx_expr_quote = 0,
684+
.mdx_expr_block_comment = 0,
685+
.in_mdx_esm = 0,
686+
.mdx_esm_depth = 0,
687+
.mdx_esm_quote = 0,
688+
.mdx_esm_block_comment = 0
402689
};
403690

404691
const char *p = src;
@@ -451,6 +738,21 @@ static char *strip_markdown (const char *src, size_t len, size_t *out_len, bool
451738
continue;
452739
}
453740

741+
// MDX top-level ESM (import/export) is scaffolding, not searchable prose.
742+
if (ctx.mdx_mode) {
743+
if (ctx.in_mdx_esm) {
744+
if (mdx_update_esm_state(&ctx, p, line_end)) ctx.in_mdx_esm = 0;
745+
p = (nl < end) ? nl + 1 : nl;
746+
continue;
747+
}
748+
749+
if (mdx_line_starts_esm(p, line_end)) {
750+
ctx.in_mdx_esm = !mdx_update_esm_state(&ctx, p, line_end);
751+
p = (nl < end) ? nl + 1 : nl;
752+
continue;
753+
}
754+
}
755+
454756
// Blank lines
455757
const char *tp = p;
456758
while (tp < line_end && (*tp == ' ' || *tp == '\t')) ++tp;
@@ -630,10 +932,10 @@ static int parse_sections (const char *buffer, size_t buffer_size, bool skip_sem
630932
}
631933

632934
// Strip markdown from all sections
633-
static int strip_sections (parse_ctx_t *ctx, const char *buffer, bool skip_html) {
935+
static int strip_sections (parse_ctx_t *ctx, const char *buffer, bool skip_html, bool mdx_mode) {
634936
for (size_t i = 0; i < ctx->sec_count; i++) {
635937
section_t *s = &ctx->sections[i];
636-
s->text = strip_markdown(buffer + s->start, s->end - s->start, &s->text_len, skip_html);
938+
s->text = strip_markdown(buffer + s->start, s->end - s->start, &s->text_len, skip_html, mdx_mode);
637939
if (!s->text) {
638940
// Free previously allocated texts and set to NULL to avoid double-free
639941
for (size_t j = 0; j < i; j++) {
@@ -761,7 +1063,7 @@ int dbmem_parse (const char *md, size_t md_len, dbmem_parse_settings *settings)
7611063
}
7621064

7631065
// 2. Strip markdown from sections
764-
if (strip_sections(&ctx, md, settings->skip_html) != 0) {
1066+
if (strip_sections(&ctx, md, settings->skip_html, settings->mdx_mode) != 0) {
7651067
free_sections(&ctx);
7661068
return -1;
7671069
}

src/dbmem-parser.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ typedef struct {
2020
size_t chars_per_token; // estimated number of characters per token
2121
bool skip_semantic; // if true, do not semantically parse MD file
2222
bool skip_html; // if true, remove html tags
23+
bool mdx_mode; // if true, strip MDX ESM and expressions
2324
} dbmem_parse_settings;
2425

2526
int dbmem_parse (const char *md, size_t md_len, dbmem_parse_settings *settings);

src/sqlite-memory.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1456,6 +1456,7 @@ static int dbmem_process_buffer (dbmem_context *ctx, const char *buffer, int64_t
14561456
settings.overlay_tokens = ctx->overlay_tokens;
14571457
settings.skip_semantic = ctx->skip_semantic;
14581458
settings.skip_html = ctx->skip_html;
1459+
settings.mdx_mode = (ctx->path && dbmem_file_has_extension(ctx->path, "mdx"));
14591460

14601461
sqlite3 *db = ctx->db;
14611462
int rc = dbmem_database_begin_transaction(db);

0 commit comments

Comments
 (0)