|
2 | 2 | import configparser |
3 | 3 | import logging |
4 | 4 | from pathlib import Path |
| 5 | +import re |
5 | 6 | from typing import TypedDict |
6 | 7 | from urllib.request import pathname2url |
7 | 8 |
|
@@ -355,6 +356,170 @@ class ExtractedRstType(TypedDict): |
355 | 356 | end_idx: int |
356 | 357 |
|
357 | 358 |
|
| 359 | +class ParsedDirective(TypedDict): |
| 360 | + """A single parsed RST directive.""" |
| 361 | + |
| 362 | + name: str |
| 363 | + argument: str |
| 364 | + options: dict[str, str] |
| 365 | + content: str |
| 366 | + has_extra_content: bool |
| 367 | + directive_line_offset: int |
| 368 | + """0-based line index of the ``.. name::`` line within the input text.""" |
| 369 | + content_line_offset: int | None |
| 370 | + """0-based line index where the directive content starts within the input text. |
| 371 | +
|
| 372 | + ``None`` if the directive has no content body. |
| 373 | + """ |
| 374 | + |
| 375 | + |
# Matches a directive marker line of the form ``.. name:: argument``.
# Group 1: leading whitespace before ``..`` (used as the directive's indent).
# Group 2: directive name (word characters plus ``:``, ``.``, ``+``, ``-``).
# Group 3: the remainder of the line after ``::`` (may be empty).
_RE_DIRECTIVE = re.compile(r"^(\s*)\.\.\s+([\w:.+-]+)\s*::\s*(.*)")
# Matches an indented field-list option line of the form ``:key: value``.
# Group 1: the text between the two colons (cannot itself contain ``:``).
# Group 2: the remainder of the line after the closing colon (may be empty).
_RE_OPTION = re.compile(r"^\s+:([^:]+):\s*(.*)")
| 378 | + |
| 379 | + |
def _parse_options(body_lines: list[str]) -> tuple[dict[str, str], int]:
    """Read the leading ``:key: value`` option block of a directive body.

    Option values may wrap onto following lines; each wrapped line must
    begin with a space and is appended to the value after a single space.

    :param body_lines: Lines of the directive body, starting directly
        after the ``.. name::`` line.
    :return: Tuple of (options dict, index into ``body_lines`` at which
        the content portion begins).
    """
    parsed: dict[str, str] = {}
    active_key: str | None = None

    for idx, raw in enumerate(body_lines):
        if not raw.strip():
            # A blank line closes the option block; content follows it.
            return parsed, idx + 1

        hit = _RE_OPTION.match(raw)
        if hit is not None:
            active_key = hit.group(1).strip()
            parsed[active_key] = hit.group(2).strip()
            continue

        if active_key is None or not raw.startswith(" "):
            # Neither an option nor a continuation: content starts here.
            return parsed, idx

        # Continuation of the previous option's value.
        # NOTE: docutils measures continuation indent relative to the start
        # of the field body. Any leading space is accepted here, which is
        # looser but adequate inside a directive body, where every line is
        # already indented past the directive marker.
        extra = raw.strip()
        so_far = parsed[active_key]
        parsed[active_key] = f"{so_far} {extra}" if so_far else extra

    # Every line belonged to the option block; there is no content.
    return parsed, len(body_lines)
| 419 | + |
| 420 | + |
def _extract_content(
    body_lines: list[str], content_start: int
) -> tuple[list[str], int]:
    """Return the dedented content lines of a directive body.

    Blank lines at the start and end of the content are dropped, then the
    smallest indentation found on any non-blank line is stripped from every
    line. Blank lines inside the content are normalised to ``""``.

    :param body_lines: Lines of the directive body.
    :param content_start: Index into ``body_lines`` where content begins.
    :return: Tuple of (dedented content lines, number of leading blank
        lines removed).
    """
    candidates = body_lines[content_start:]
    total = len(candidates)

    # Index of the first non-blank line == number of leading blanks dropped.
    first = 0
    while first < total and not candidates[first].strip():
        first += 1

    # One past the last non-blank line.
    stop = total
    while stop > first and not candidates[stop - 1].strip():
        stop -= 1

    kept = candidates[first:stop]
    if not kept:
        return kept, first

    # ``kept`` starts and ends on non-blank lines, so ``min`` has input.
    indent = min(len(row) - len(row.lstrip()) for row in kept if row.strip())
    dedented = [row[indent:] if row.strip() else "" for row in kept]
    return dedented, first
| 441 | + |
| 442 | + |
def parse_single_directive(rst_text: str) -> ParsedDirective | None:
    """Parse one RST directive located at the start of ``rst_text``.

    The first non-blank line must be the directive marker, e.g.::

        .. need-type:: argument
           :option: value

           Content body here.

    :param rst_text: The RST text to parse.
    :return: The parsed directive, or ``None`` when the first non-blank
        line is not a directive (or the text has no non-blank line).
    """
    lines = rst_text.splitlines()

    # Only the first non-blank line is a candidate for the directive marker.
    first_idx = next((n for n, text in enumerate(lines) if text.strip()), None)
    if first_idx is None:
        return None
    header = _RE_DIRECTIVE.match(lines[first_idx])
    if header is None:
        return None

    leading_ws, name, raw_argument = header.groups()
    base_indent = len(leading_ws)
    # NOTE: docutils lets directive arguments span several lines before the
    # first field-list marker. Only the ``.. name::`` line is read here,
    # which is sufficient for NeedDirective where the argument is a
    # single-line title.
    argument = raw_argument.strip()

    # The body is every following line indented deeper than the directive.
    # Blank lines are skipped without extending the body, so blank lines
    # between the body and any outside text are not counted as body.
    last_body_idx = first_idx
    for n, text in enumerate(lines[first_idx + 1 :], start=first_idx + 1):
        stripped = text.lstrip()
        if not stripped:
            continue
        if len(text) - len(stripped) <= base_indent:
            break
        last_body_idx = n

    body = lines[first_idx + 1 : last_body_idx + 1]
    options, content_start = _parse_options(body)
    content_lines, skipped_blanks = _extract_content(body, content_start)

    # Any non-blank line after the body counts as extra content.
    has_extra = any(text.strip() for text in lines[last_body_idx + 1 :])

    # Offsets are 0-based line indices relative to the start of rst_text.
    content_line_offset: int | None = None
    if content_lines:
        content_line_offset = first_idx + 1 + content_start + skipped_blanks

    return ParsedDirective(
        name=name,
        argument=argument,
        options=options,
        content="\n".join(content_lines),
        has_extra_content=has_extra,
        directive_line_offset=first_idx,
        content_line_offset=content_line_offset,
    )
| 521 | + |
| 522 | + |
358 | 523 | # @Extract reStructuredText blocks embedded in comments, IMPL_RST_1, impl, [FE_RST_EXTRACTION] |
359 | 524 | def extract_rst( |
360 | 525 | text: str, start_marker: str, end_marker: str |
|
0 commit comments