|
13 | 13 | # limitations under the License. |
14 | 14 |
|
15 | 15 | import re |
16 | | -from typing import Optional, List, Dict |
| 16 | +from typing import Optional, Dict |
17 | 17 |
|
18 | 18 | import pypandoc # type: ignore |
19 | 19 |
|
20 | 20 | from gapic.utils.lines import wrap |
21 | 21 |
|
22 | | -# --- PERFORMANCE CACHE --- |
| 22 | +# Cache for the few complex items we actually send to pandoc |
23 | 23 | _RAW_RST_CACHE: Dict[str, str] = {} |
24 | 24 |
|
25 | | - |
26 | | -def _aggressive_fast_convert(text: str) -> Optional[str]: |
| 25 | +def _tuned_fast_convert(text: str) -> Optional[str]: |
27 | 26 | """ |
28 | | - Converts common Markdown (Code, Links, Lists) to RST using pure Python. |
29 | | - Only gives up (returns None) for complex structures like Tables. |
| 27 | + Converts Markdown to RST using pure Python. |
| 28 | + Only falls back to Pandoc for Tables and Images. |
30 | 29 | """ |
31 | | - # 1. TABLE CHECK (The only thing we strictly need Pandoc for) |
32 | | - # If we see a pipe surrounded by spaces, it's likely a table. |
33 | | - if re.search(r" \| ", text) or re.search(r"\|\n", text): |
| 30 | + # --- 1. FALLBACKS --- |
| 31 | + # Tables (pipe surrounded by spaces) or Images (![). |
| 32 | + # We allow "][" (Reference Links) to be handled by Python now. |
| 33 | + if (re.search(r" \| ", text) or re.search(r"\|\n", text)) or "![" in text: |
34 | 34 | return None |
35 | 35 |
|
36 | | - # 2. CODE BLOCKS: `code` -> ``code`` |
37 | | - # RST requires double backticks. Markdown uses one. |
38 | | - # We look for backticks that aren't already double. |
39 | | - # Regex: Negative lookbehind/lookahead to ensure we don't match ``already rst``. |
40 | | - converted = re.sub(r"(?<!`)`([^`]+)`(?!`)", r"``\1``", text) |
| 36 | + # --- 2. CONVERSION --- |
| 37 | + |
| 38 | + # A. CODE BLOCKS: `code` -> ``code`` |
| 39 | + # CRITICAL: Run this FIRST. This ensures we handle existing backticks |
| 40 | + # before we create NEW backticks for links. |
| 41 | + # (?<!:) ensures we don't break Sphinx roles like :class:`MyClass` |
| 42 | + converted = re.sub(r"(?<!:|`)`([^`]+)`(?!`)", r"``\1``", text) |
| 43 | + |
| 44 | + # B. REFERENCE LINKS: [Text][Ref] -> `Text <Ref>`__ |
| 45 | + # We fix the broken documentation by converting these to valid RST links. |
| 46 | + # Since step A is done, these new backticks will NOT be doubled. |
| 47 | + converted = re.sub(r"\[([^\]]+)\]\[([^\]]+)\]", r"`\1 <\2>`__", converted) |
41 | 48 |
|
42 | | - # 3. LINKS: [Text](URL) -> `Text <URL>`__ |
43 | | - # We use anonymous links (__) to avoid collision issues. |
| 49 | + # C. STANDARD LINKS: [Text](URL) -> `Text <URL>`__ |
44 | 50 | converted = re.sub(r"\[([^\]]+)\]\(([^)]+)\)", r"`\1 <\2>`__", converted) |
45 | 51 |
|
46 | | - # 4. BOLD: **text** -> **text** (Compatible, no change needed) |
47 | | - |
48 | | - # 5. HEADINGS: # Heading -> Heading\n======= |
49 | | - # (Simple fix for H1/H2, mostly sufficient for docstrings) |
| 52 | + # D. BOLD/ITALICS: |
| 53 | + converted = re.sub(r"(?<!_)\b_([^_]+)_\b(?!_)", r"*\1*", converted) |
| 54 | + |
| 55 | + # E. HEADINGS: # Heading -> Heading\n======= |
50 | 56 | converted = re.sub(r"^# (.*)$", r"\1\n" + "=" * 10, converted, flags=re.MULTILINE) |
51 | 57 | converted = re.sub(r"^## (.*)$", r"\1\n" + "-" * 10, converted, flags=re.MULTILINE) |
52 | 58 |
|
53 | | - # 6. LISTS: Markdown lists (- item) work in RST mostly fine. |
54 | | - # We just ensure there's a newline before a list starts to satisfy RST strictness. |
55 | | - converted = re.sub(r"(\n[^-*].*)\n\s*[-*] ", r"\1\n\n- ", converted) |
| 59 | + # F. LISTS: Markdown (- item) needs a preceding newline for RST. |
| 60 | + converted = re.sub(r"(\n[^-*].*)\n\s*([-*] )", r"\1\n\n\2", converted) |
56 | 61 |
|
57 | 62 | return converted |
58 | 63 |
|
59 | | - |
60 | | -def batch_convert_docstrings(docstrings: List[str]): |
61 | | - """ |
62 | | - Optimized Batch Processor. |
63 | | - 1. Tries Aggressive Python Conversion first. |
64 | | - 2. Only sends Tables/Complex items to Pandoc. |
65 | | - """ |
66 | | - unique_docs = set(docstrings) |
67 | | - |
68 | | - # Filter: Only keep strings that need conversion and aren't in cache |
69 | | - candidates = [ |
70 | | - d for d in unique_docs |
71 | | - if d |
72 | | - and d not in _RAW_RST_CACHE |
73 | | - and re.search(r"[|*`_[\]#]", d) # Only interesting chars |
74 | | - ] |
75 | | - |
76 | | - if not candidates: |
77 | | - return |
78 | | - |
79 | | - pandoc_batch: List[str] = [] |
80 | | - |
81 | | - # 1. Try Python Conversion |
82 | | - for doc in candidates: |
83 | | - fast_result = _aggressive_fast_convert(doc) |
84 | | - if fast_result is not None: |
85 | | - # Success: Saved ~50ms per call |
86 | | - _RAW_RST_CACHE[doc] = fast_result.strip() |
87 | | - else: |
88 | | - # Failed: Must use Pandoc (Tables, etc) |
89 | | - pandoc_batch.append(doc) |
90 | | - |
91 | | - # 2. Process Remainder with Pandoc (Likely < 10 items) |
92 | | - if not pandoc_batch: |
93 | | - return |
94 | | - |
95 | | - separator = "\n\n__GAPIC_BATCH_SPLIT__\n\n" |
96 | | - giant_payload = separator.join(pandoc_batch) |
97 | | - |
98 | | - try: |
99 | | - converted_payload = pypandoc.convert_text( |
100 | | - giant_payload, |
101 | | - "rst", |
102 | | - format="commonmark", |
103 | | - extra_args=["--columns=1000"] |
104 | | - ) |
105 | | - except Exception: |
106 | | - return |
107 | | - |
108 | | - split_marker = "__GAPIC_BATCH_SPLIT__" |
109 | | - results = converted_payload.split(split_marker) |
110 | | - |
111 | | - if len(results) == len(pandoc_batch): |
112 | | - for original, converted in zip(pandoc_batch, results): |
113 | | - _RAW_RST_CACHE[original] = converted.strip() |
114 | | - |
115 | | - |
116 | 64 | def rst( |
117 | 65 | text: str, |
118 | 66 | width: int = 72, |
119 | 67 | indent: int = 0, |
120 | 68 | nl: Optional[bool] = None, |
121 | 69 | source_format: str = "commonmark", |
122 | 70 | ): |
123 | | - """Convert the given text to ReStructured Text.""" |
124 | | - |
125 | 71 | # 1. Super Fast Path: No special chars? Just wrap. |
126 | 72 | if not re.search(r"[|*`_[\]#]", text): |
127 | | - answer = wrap( |
128 | | - text, |
129 | | - indent=indent, |
130 | | - offset=indent + 3, |
131 | | - width=width - indent, |
132 | | - ) |
| 73 | + answer = wrap(text, indent=indent, offset=indent + 3, width=width - indent) |
133 | 74 | return _finalize(answer, nl, indent) |
134 | 75 |
|
135 | 76 | # 2. Check Cache |
136 | 77 | if text in _RAW_RST_CACHE: |
137 | 78 | raw_rst = _RAW_RST_CACHE[text] |
138 | 79 | else: |
139 | | - # Slow Path: Missed by batch or new string. |
140 | | - # TRY PYTHON CONVERT FIRST. |
141 | | - # This prevents the 'Slow Path' from actually being slow. |
142 | | - fast_result = _aggressive_fast_convert(text) |
| 80 | + # 3. Try Tuned Python Convert (Fastest) |
| 81 | + fast_result = _tuned_fast_convert(text) |
143 | 82 |
|
144 | 83 | if fast_result is not None: |
145 | 84 | raw_rst = fast_result.strip() |
146 | 85 | else: |
147 | | - # The absolute last resort: Shell out to Pandoc |
| 86 | + # 4. Fallback to Pandoc (Only for Tables/Images) |
148 | 87 | raw_rst = pypandoc.convert_text( |
149 | | - text, |
150 | | - "rst", |
151 | | - format=source_format, |
152 | | - extra_args=["--columns=1000"] |
| 88 | + text, "rst", format=source_format, extra_args=["--columns=1000"] |
153 | 89 | ).strip() |
154 | 90 |
|
155 | 91 | _RAW_RST_CACHE[text] = raw_rst |
156 | 92 |
|
157 | | - # 3. Python Formatting |
| 93 | + # 5. Python Formatting |
158 | 94 | if "::" in raw_rst or ".. code" in raw_rst: |
159 | 95 | answer = raw_rst.replace("\n", f"\n{' ' * indent}") |
160 | 96 | else: |
|
0 commit comments