|
21 | 21 | #include "dbmem-utils.h" |
22 | 22 | #include "md4c.h" |
23 | 23 |
|
| 24 | +#include <ctype.h> |
24 | 25 | #include <stdio.h> |
25 | 26 | #include <string.h> |
26 | 27 |
|
@@ -66,10 +67,19 @@ typedef struct { |
66 | 67 | size_t wp; // Write position in buffer |
67 | 68 | const char *line_end; // End of current line |
68 | 69 | bool skip_html; // Whether to skip HTML tags |
| 70 | + bool mdx_mode; // Whether to strip MDX-specific syntax |
69 | 71 | int in_html_tag; // Currently inside multi-line HTML tag |
70 | 72 | int in_fenced_code; // Currently inside fenced code block |
71 | 73 | char fence_char; // Fence character (` or ~) |
72 | 74 | int fence_width; // Number of fence characters |
| 75 | + int in_mdx_expr; // Currently inside a multi-line MDX expression |
| 76 | + int mdx_expr_depth; // Nested brace depth for MDX expressions |
| 77 | + char mdx_expr_quote; // Active quote inside MDX expression |
| 78 | + int mdx_expr_block_comment; |
| 79 | + int in_mdx_esm; // Currently skipping a multi-line ESM statement |
| 80 | + int mdx_esm_depth; // Nested delimiter depth for ESM statements |
| 81 | + char mdx_esm_quote; // Active quote inside ESM statement |
| 82 | + int mdx_esm_block_comment; |
73 | 83 | } strip_ctx_t; |
74 | 84 |
|
75 | 85 | // MARK: - Helpers - |
@@ -162,6 +172,255 @@ static const char *skip_until (const char *p, const char *end, char c) { |
162 | 172 | return (p < end) ? p + 1 : p; |
163 | 173 | } |
164 | 174 |
|
// Advance past any run of spaces and horizontal tabs starting at p.
// Never reads at or beyond end; returns the first position that holds
// neither ' ' nor '\t' (or end when the run reaches it).
static const char *skip_spaces_tabs (const char *p, const char *end) {
    for (; p < end; ++p) {
        if (*p != ' ' && *p != '\t') break;
    }
    return p;
}
| 179 | + |
// Returns 1 when c may appear inside an identifier as recognised by this
// scanner: a character accepted by isalnum(), '_' or '$'. Returns 0 otherwise.
static int is_ident_continue (char c) {
    if (c == '_' || c == '$') return 1;
    // Cast first: passing a negative char value to <ctype.h> functions is undefined.
    return isalnum((unsigned char)c) != 0;
}
| 183 | + |
// Returns 1 when the bytes at p spell the keyword kw and the keyword is not
// immediately followed by another identifier character inside [p, end).
// Only the right-hand boundary is checked; callers that need a left-hand
// boundary must check the preceding character themselves.
static int starts_keyword (const char *p, const char *end, const char *kw) {
    const size_t kw_len = strlen(kw);
    const size_t avail = (size_t)(end - p);

    if (avail < kw_len || memcmp(p, kw, kw_len) != 0) return 0;

    // Keyword runs exactly to the end of the range: nothing can follow it.
    if (avail == kw_len) return 1;

    return !is_ident_continue(p[kw_len]);
}
| 190 | + |
// Skip one identifier starting at p.
// Returns the position just past the identifier, or NULL when p is at end
// or does not begin an identifier. A leading digit is rejected: JavaScript
// identifiers cannot start with one, so `import 3 from ...` in prose must not
// be mistaken for an import binding.
static const char *skip_identifier (const char *p, const char *end) {
    if (p >= end || !is_ident_continue(*p)) return NULL;
    if (isdigit((unsigned char)*p)) return NULL;
    while (p < end && is_ident_continue(*p)) ++p;
    return p;
}
| 196 | + |
// Locate the first non-space character of a line that may hold top-level content.
// Returns NULL when the line is indented by more than three spaces, or when a
// tab immediately follows the leading spaces; otherwise returns the position
// after the leading spaces (which may equal end).
static const char *mdx_top_level_start (const char *p, const char *end) {
    const char *line = p;

    while (p < end && *p == ' ') ++p;

    if (p - line > 3) return NULL;
    if (p < end && *p == '\t') return NULL;
    return p;
}
| 203 | + |
// Scan [p, end) for a standalone `from` keyword.
// Text inside '...', "..." and `...` literals and inside /* ... */ comments is
// ignored, and scanning stops at a `//` line comment.
// Returns 1 when `from` is found with a word boundary on BOTH sides, else 0.
static int mdx_has_from_keyword (const char *p, const char *end) {
    const char *start = p;
    char quote = 0;
    int in_block_comment = 0;

    while (p < end) {
        char c = *p;

        if (quote) {
            // Backslash escapes the next character inside a string literal.
            if (c == '\\' && p + 1 < end) {
                p += 2;
                continue;
            }
            if (c == quote) quote = 0;
            ++p;
            continue;
        }

        if (in_block_comment) {
            if (c == '*' && p + 1 < end && p[1] == '/') {
                in_block_comment = 0;
                p += 2;
                continue;
            }
            ++p;
            continue;
        }

        if (c == '/' && p + 1 < end) {
            if (p[1] == '/') break;         // rest of the line is a comment
            if (p[1] == '*') {
                in_block_comment = 1;
                p += 2;
                continue;
            }
        }

        if (c == '\'' || c == '"' || c == '`') {
            quote = c;
            ++p;
            continue;
        }

        // starts_keyword() only checks the character after the keyword, so also
        // require that the previous character does not belong to an identifier;
        // otherwise the tail of a name such as `xfrom` would match.
        if ((p == start || !is_ident_continue(p[-1])) && starts_keyword(p, end, "from")) return 1;
        ++p;
    }

    return 0;
}
| 252 | + |
// Decide whether the import clause starting at p looks like real ESM:
// either a `from` keyword appears on this line, or a '{' opened on this line
// is still unclosed at end (a named-import list that continues on later lines).
// String literals are skipped and scanning stops at a `//` line comment.
static int mdx_import_clause_plausible (const char *p, const char *end) {
    if (mdx_has_from_keyword(p, end)) return 1;

    int depth = 0;
    char quote = 0;

    while (p < end) {
        char c = *p;

        if (quote) {
            if (c == '\\' && p + 1 < end) {
                p += 2;
                continue;
            }
            if (c == quote) quote = 0;
        } else if (c == '/' && p + 1 < end && p[1] == '/') {
            break;
        } else if (c == '\'' || c == '"' || c == '`') {
            quote = c;
        } else if (c == '{') {
            depth++;
        } else if (c == '}' && depth > 0) {
            depth--;
        }

        ++p;
    }

    return depth > 0;
}

// Heuristically decide whether the line [p, end) begins a top-level MDX ESM
// statement (`import ...` or `export ...`). Returns 1 if so, 0 otherwise.
//
// Recognised import forms:
//   import 'mod' / import "mod"
//   import { ... } from / import * ... from
//   import {            (named list left open, continued on following lines)
//   import Name from / import Name, ... from
//   import type ...     (`type` as a modifier, or as the default binding name
//                        in `import type from 'mod'`)
// Recognised export forms: `export {`, `export *`, or `export` followed by one
// of const/let/var/function/class/default/async/type/interface/enum.
static int mdx_line_starts_esm (const char *p, const char *end) {
    p = mdx_top_level_start(p, end);
    if (!p) return 0;

    if (starts_keyword(p, end, "import")) {
        p += strlen("import");
        // Require whitespace after the keyword: rules out `import(` and `import.meta`.
        if (p >= end || (*p != ' ' && *p != '\t')) return 0;
        p = skip_spaces_tabs(p, end);
        if (p >= end) return 0;
        if (*p == '\'' || *p == '"') return 1;
        if (*p == '{' || *p == '*') return mdx_import_clause_plausible(p, end);

        const char *ident_end = skip_identifier(p, end);
        if (!ident_end) return 0;

        if ((size_t)(ident_end - p) == 4 && strncmp(p, "type", 4) == 0) {
            const char *q = skip_spaces_tabs(ident_end, end);
            if (q < end && (*q == '{' || *q == '*')) return mdx_import_clause_plausible(q, end);

            const char *next_end = skip_identifier(q, end);
            if (next_end) {
                // `import type from 'mod'` imports a default binding named `type`;
                // in every other `import type <ident> ...` form `type` is a modifier.
                int type_is_binding = 0;
                if (starts_keyword(q, end, "from")) {
                    const char *r = skip_spaces_tabs(next_end, end);
                    type_is_binding = (r < end && (*r == '\'' || *r == '"'));
                }
                if (!type_is_binding) {
                    p = q;
                    ident_end = next_end;
                }
            }
            // No identifier after `type`: fall through and treat `type` itself
            // as the default binding (covers `import type, { a } from 'mod'`).
        }

        p = skip_spaces_tabs(ident_end, end);
        if (starts_keyword(p, end, "from")) return 1;
        if (p < end && *p == ',') return mdx_import_clause_plausible(p + 1, end);
        return 0;
    }

    if (starts_keyword(p, end, "export")) {
        p += strlen("export");
        if (p >= end || (*p != ' ' && *p != '\t')) return 0;
        p = skip_spaces_tabs(p, end);
        if (p >= end) return 0;
        if (*p == '{' || *p == '*') return 1;
        return starts_keyword(p, end, "const") ||
               starts_keyword(p, end, "let") ||
               starts_keyword(p, end, "var") ||
               starts_keyword(p, end, "function") ||
               starts_keyword(p, end, "class") ||
               starts_keyword(p, end, "default") ||
               starts_keyword(p, end, "async") ||
               starts_keyword(p, end, "type") ||
               starts_keyword(p, end, "interface") ||
               starts_keyword(p, end, "enum");
    }

    return 0;
}
| 301 | + |
| 302 | +static int mdx_update_esm_state (strip_ctx_t *ctx, const char *p, const char *end) { |
| 303 | + int saw_semicolon = 0; |
| 304 | + |
| 305 | + while (p < end) { |
| 306 | + char c = *p; |
| 307 | + |
| 308 | + if (ctx->mdx_esm_quote) { |
| 309 | + if (c == '\\' && p + 1 < end) { |
| 310 | + p += 2; |
| 311 | + continue; |
| 312 | + } |
| 313 | + if (c == ctx->mdx_esm_quote) ctx->mdx_esm_quote = 0; |
| 314 | + ++p; |
| 315 | + continue; |
| 316 | + } |
| 317 | + |
| 318 | + if (ctx->mdx_esm_block_comment) { |
| 319 | + if (c == '*' && p + 1 < end && p[1] == '/') { |
| 320 | + ctx->mdx_esm_block_comment = 0; |
| 321 | + p += 2; |
| 322 | + continue; |
| 323 | + } |
| 324 | + ++p; |
| 325 | + continue; |
| 326 | + } |
| 327 | + |
| 328 | + if (c == '/' && p + 1 < end) { |
| 329 | + if (p[1] == '/') break; |
| 330 | + if (p[1] == '*') { |
| 331 | + ctx->mdx_esm_block_comment = 1; |
| 332 | + p += 2; |
| 333 | + continue; |
| 334 | + } |
| 335 | + } |
| 336 | + |
| 337 | + if (c == '\'' || c == '"' || c == '`') { |
| 338 | + ctx->mdx_esm_quote = c; |
| 339 | + ++p; |
| 340 | + continue; |
| 341 | + } |
| 342 | + |
| 343 | + if (c == '(' || c == '[' || c == '{') { |
| 344 | + ctx->mdx_esm_depth++; |
| 345 | + } else if (c == ')' || c == ']' || c == '}') { |
| 346 | + if (ctx->mdx_esm_depth > 0) ctx->mdx_esm_depth--; |
| 347 | + } else if (c == ';' && ctx->mdx_esm_depth == 0) { |
| 348 | + saw_semicolon = 1; |
| 349 | + } |
| 350 | + |
| 351 | + ++p; |
| 352 | + } |
| 353 | + |
| 354 | + if (ctx->mdx_esm_quote || ctx->mdx_esm_block_comment || ctx->mdx_esm_depth > 0) return 0; |
| 355 | + return saw_semicolon || ctx->mdx_esm_depth == 0; |
| 356 | +} |
| 357 | + |
| 358 | +static const char *mdx_skip_expression (strip_ctx_t *ctx, const char *p, const char *end) { |
| 359 | + if (!ctx->in_mdx_expr) { |
| 360 | + ctx->in_mdx_expr = 1; |
| 361 | + ctx->mdx_expr_depth = 1; |
| 362 | + ctx->mdx_expr_quote = 0; |
| 363 | + ctx->mdx_expr_block_comment = 0; |
| 364 | + ++p; |
| 365 | + } |
| 366 | + |
| 367 | + while (p < end) { |
| 368 | + char c = *p; |
| 369 | + |
| 370 | + if (ctx->mdx_expr_quote) { |
| 371 | + if (c == '\\' && p + 1 < end) { |
| 372 | + p += 2; |
| 373 | + continue; |
| 374 | + } |
| 375 | + if (c == ctx->mdx_expr_quote) ctx->mdx_expr_quote = 0; |
| 376 | + ++p; |
| 377 | + continue; |
| 378 | + } |
| 379 | + |
| 380 | + if (ctx->mdx_expr_block_comment) { |
| 381 | + if (c == '*' && p + 1 < end && p[1] == '/') { |
| 382 | + ctx->mdx_expr_block_comment = 0; |
| 383 | + p += 2; |
| 384 | + continue; |
| 385 | + } |
| 386 | + ++p; |
| 387 | + continue; |
| 388 | + } |
| 389 | + |
| 390 | + if (c == '/' && p + 1 < end) { |
| 391 | + if (p[1] == '/') break; |
| 392 | + if (p[1] == '*') { |
| 393 | + ctx->mdx_expr_block_comment = 1; |
| 394 | + p += 2; |
| 395 | + continue; |
| 396 | + } |
| 397 | + } |
| 398 | + |
| 399 | + if (c == '\'' || c == '"' || c == '`') { |
| 400 | + ctx->mdx_expr_quote = c; |
| 401 | + ++p; |
| 402 | + continue; |
| 403 | + } |
| 404 | + |
| 405 | + if (c == '{') { |
| 406 | + ctx->mdx_expr_depth++; |
| 407 | + } else if (c == '}') { |
| 408 | + ctx->mdx_expr_depth--; |
| 409 | + ++p; |
| 410 | + if (ctx->mdx_expr_depth <= 0) { |
| 411 | + ctx->in_mdx_expr = 0; |
| 412 | + ctx->mdx_expr_depth = 0; |
| 413 | + return p; |
| 414 | + } |
| 415 | + continue; |
| 416 | + } |
| 417 | + |
| 418 | + ++p; |
| 419 | + } |
| 420 | + |
| 421 | + return p; |
| 422 | +} |
| 423 | + |
165 | 424 | // Check if line starts a fenced code block. Returns fence width (0 if not a fence). |
166 | 425 | static int check_fence_start (const char *p, const char *end, char *out_char) { |
167 | 426 | p = skip_leading_spaces(p, end, 3); |
@@ -325,6 +584,25 @@ static void process_inline (strip_ctx_t *ctx, const char *start, const char *end |
325 | 584 | const char *p = start; |
326 | 585 |
|
327 | 586 | while (p < end) { |
| 587 | + // Continue skipping a multi-line MDX expression. |
| 588 | + if (ctx->mdx_mode && ctx->in_mdx_expr) { |
| 589 | + p = mdx_skip_expression(ctx, p, end); |
| 590 | + continue; |
| 591 | + } |
| 592 | + |
| 593 | + // Escaped opening braces are literal text in MDX. |
| 594 | + if (ctx->mdx_mode && *p == '\\' && p + 1 < end && p[1] == '{') { |
| 595 | + ctx->buf[ctx->wp++] = '{'; |
| 596 | + p += 2; |
| 597 | + continue; |
| 598 | + } |
| 599 | + |
| 600 | + // MDX expression: remove the JavaScript expression but keep nearby text. |
| 601 | + if (ctx->mdx_mode && *p == '{') { |
| 602 | + p = mdx_skip_expression(ctx, p, end); |
| 603 | + continue; |
| 604 | + } |
| 605 | + |
328 | 606 | // HTML tags |
329 | 607 | if (ctx->skip_html && *p == '<') { |
330 | 608 | const char *gt = p + 1; |
@@ -387,18 +665,27 @@ static void process_inline (strip_ctx_t *ctx, const char *start, const char *end |
387 | 665 | } |
388 | 666 |
|
389 | 667 | // Main markdown stripping function |
390 | | -static char *strip_markdown (const char *src, size_t len, size_t *out_len, bool skip_html) { |
| 668 | +static char *strip_markdown (const char *src, size_t len, size_t *out_len, bool skip_html, bool mdx_mode) { |
391 | 669 | char *buf = (char *)dbmemory_alloc(len + 1); |
392 | 670 | if (!buf) return NULL; |
393 | 671 |
|
394 | 672 | strip_ctx_t ctx = { |
395 | 673 | .buf = buf, |
396 | 674 | .wp = 0, |
397 | 675 | .skip_html = skip_html, |
| 676 | + .mdx_mode = mdx_mode, |
398 | 677 | .in_html_tag = 0, |
399 | 678 | .in_fenced_code = 0, |
400 | 679 | .fence_char = 0, |
401 | | - .fence_width = 0 |
| 680 | + .fence_width = 0, |
| 681 | + .in_mdx_expr = 0, |
| 682 | + .mdx_expr_depth = 0, |
| 683 | + .mdx_expr_quote = 0, |
| 684 | + .mdx_expr_block_comment = 0, |
| 685 | + .in_mdx_esm = 0, |
| 686 | + .mdx_esm_depth = 0, |
| 687 | + .mdx_esm_quote = 0, |
| 688 | + .mdx_esm_block_comment = 0 |
402 | 689 | }; |
403 | 690 |
|
404 | 691 | const char *p = src; |
@@ -451,6 +738,21 @@ static char *strip_markdown (const char *src, size_t len, size_t *out_len, bool |
451 | 738 | continue; |
452 | 739 | } |
453 | 740 |
|
| 741 | + // MDX top-level ESM (import/export) is scaffolding, not searchable prose. |
| 742 | + if (ctx.mdx_mode) { |
| 743 | + if (ctx.in_mdx_esm) { |
| 744 | + if (mdx_update_esm_state(&ctx, p, line_end)) ctx.in_mdx_esm = 0; |
| 745 | + p = (nl < end) ? nl + 1 : nl; |
| 746 | + continue; |
| 747 | + } |
| 748 | + |
| 749 | + if (mdx_line_starts_esm(p, line_end)) { |
| 750 | + ctx.in_mdx_esm = !mdx_update_esm_state(&ctx, p, line_end); |
| 751 | + p = (nl < end) ? nl + 1 : nl; |
| 752 | + continue; |
| 753 | + } |
| 754 | + } |
| 755 | + |
454 | 756 | // Blank lines |
455 | 757 | const char *tp = p; |
456 | 758 | while (tp < line_end && (*tp == ' ' || *tp == '\t')) ++tp; |
@@ -630,10 +932,10 @@ static int parse_sections (const char *buffer, size_t buffer_size, bool skip_sem |
630 | 932 | } |
631 | 933 |
|
632 | 934 | // Strip markdown from all sections |
633 | | -static int strip_sections (parse_ctx_t *ctx, const char *buffer, bool skip_html) { |
| 935 | +static int strip_sections (parse_ctx_t *ctx, const char *buffer, bool skip_html, bool mdx_mode) { |
634 | 936 | for (size_t i = 0; i < ctx->sec_count; i++) { |
635 | 937 | section_t *s = &ctx->sections[i]; |
636 | | - s->text = strip_markdown(buffer + s->start, s->end - s->start, &s->text_len, skip_html); |
| 938 | + s->text = strip_markdown(buffer + s->start, s->end - s->start, &s->text_len, skip_html, mdx_mode); |
637 | 939 | if (!s->text) { |
638 | 940 | // Free previously allocated texts and set to NULL to avoid double-free |
639 | 941 | for (size_t j = 0; j < i; j++) { |
@@ -761,7 +1063,7 @@ int dbmem_parse (const char *md, size_t md_len, dbmem_parse_settings *settings) |
761 | 1063 | } |
762 | 1064 |
|
763 | 1065 | // 2. Strip markdown from sections |
764 | | - if (strip_sections(&ctx, md, settings->skip_html) != 0) { |
| 1066 | + if (strip_sections(&ctx, md, settings->skip_html, settings->mdx_mode) != 0) { |
765 | 1067 | free_sections(&ctx); |
766 | 1068 | return -1; |
767 | 1069 | } |
|
0 commit comments