@@ -147,13 +147,56 @@ def cleanup_markdown(content: str) -> str:
147147 # Remove anchor tags before headings (e.g., <a id="overview"></a>)
148148 content = re .sub (r'<a id="[^"]+"></a>\s*\n?' , "" , content )
149149
150- # Reduce image widths by 50% for wiki (GitHub wiki renders larger)
151- def reduce_image_width (match ):
152- width = int (match .group (1 ))
153- new_width = max (16 , width // 2 ) # Reduce by 50%, minimum 16px
154- return f'width="{ new_width } "'
150+ # Badge URL patterns to skip (don't resize badges)
151+ badge_patterns = [
152+ r"shields\.io" ,
153+ r"badge\.svg" ,
154+ r"codecov\.io" ,
155+ r"github\.com/.+/actions/workflows/.+/badge" ,
156+ r"img\.shields\.io" ,
157+ r"coveralls\.io" ,
158+ r"travis-ci\.org" ,
159+ r"circleci\.com" ,
160+ r"appveyor\.com" ,
161+ r"readthedocs\.org" ,
162+ ]
163+
164+ def is_badge_url (url ):
165+ """Check if URL looks like a badge image."""
166+ return any (re .search (pattern , url ) for pattern in badge_patterns )
167+
168+ # Convert large images to HTML img tags with constrained width
169+ # Match: <a href="...">![alt](src)</a> pattern (linked images)
170+ def resize_linked_image (match ):
171+ href = match .group (1 )
172+ alt = match .group (2 )
173+ src = match .group (3 )
174+ # Skip badges - they should stay at natural size
175+ if is_badge_url (src ):
176+ return match .group (0 )
177+ return f'<a href="{ href } "><img src="{ src } " alt="{ alt } " width="120"></a>'
178+
179+ content = re .sub (
180+ r'<a href="([^"]+)">\!\[([^\]]*)\]\(([^)]+)\)</a>' ,
181+ resize_linked_image ,
182+ content ,
183+ )
184+
185+ # Match: ![alt](src) pattern (standalone images, not inside links like [![]()])
186+ # Skip images that are inside markdown links (preceded by [)
187+ def resize_standalone_image (match ):
188+ alt = match .group (1 )
189+ src = match .group (2 )
190+ # Skip badges - they should stay at natural size
191+ if is_badge_url (src ):
192+ return match .group (0 )
193+ return f'<img src="{ src } " alt="{ alt } " width="120">'
155194
156- content = re .sub (r'width="(\d+)"' , reduce_image_width , content )
195+ content = re .sub (
196+ r'(?<!["\(\[])\!\[([^\]]*)\]\(([^)]+)\)(?!["\)])' ,
197+ resize_standalone_image ,
198+ content ,
199+ )
157200
158201 # Fix collapsed div tags - add newlines after > and before <
159202 # Match: <div ...> content </div> and expand it
@@ -210,6 +253,168 @@ def fix_code_block(match):
210253 return content
211254
212255
def cleanup_api_docs(content: str) -> str:
    """Clean up API documentation for better readability.

    Reformats dense sphinx-markdown-builder API output:
    - Unescapes backslash-escaped underscores that sit between word characters
    - Breaks long function signatures (80+ characters, more than two
      parameters) into one parameter per line
    - Rewrites ``* **param** – description`` bullets as ``- **param**: description``
    - Turns ``* **Parameters:**`` style section bullets into plain bold labels
    - Normalises doubled list markers and drops orphaned ``*`` lines
    - Converts ``#### NOTE`` style headings into bold block quotes

    Args:
        content: Markdown content with API documentation.

    Returns:
        Cleaned API documentation content.
    """
    # Unescape underscores between word characters. Lookarounds are used
    # instead of capturing the neighbours so that escapes sharing a single
    # character (e.g. get\_x\_value) are all rewritten; a consuming pattern
    # would skip every second one.
    content = re.sub(r"(?<=\w)\\_(?=\w)", "_", content)

    def format_signature(match):
        """Return the matched signature with one parameter per line if long."""
        whole = match.group(0)

        # Short signatures are already readable; leave them untouched.
        if len(whole) < 80:
            return whole

        prefix, params, suffix = match.groups()

        # Split on commas that are not nested inside any bracket pair, so
        # defaults such as (1, 2) or Dict[str, int] stay in one piece.
        pieces = []
        current = []
        bracket_depth = 0
        for char in params:
            if char == "," and bracket_depth == 0:
                pieces.append("".join(current).strip())
                current = []
                continue
            if char in "([{":
                bracket_depth += 1
            elif char in ")]}":
                bracket_depth -= 1
            current.append(char)
        pieces.append("".join(current).strip())
        param_list = [piece for piece in pieces if piece]

        # With only one or two parameters a line break adds nothing.
        if len(param_list) <= 2:
            return whole

        formatted_params = ",\n ".join(param_list)
        return f"{prefix}\n {formatted_params}\n {suffix}"

    # Match function/method signatures: ### name(params)
    # The parameter group allows one level of nested parentheses. Each
    # alternative consumes exactly one character or one (...) group, which
    # keeps matching linear even when the closing parenthesis is missing
    # (a nested "[^()]+" inside the repetition backtracks exponentially).
    content = re.sub(
        r"(###\s+[\w.]+\()((?:[^()]|\([^()]*\))*?)(\))",
        format_signature,
        content,
    )

    # Parameter descriptions: "* **param** – description" (en dash or hyphen)
    # becomes "- **param**: description".
    content = re.sub(
        r"\*\s+\*\*(\w+)\*\*\s*[–-]\s*",
        r"- **\1**: ",
        content,
    )

    # Section labels: drop the list bullet in front of the bold label and
    # start the label on its own line.
    content = re.sub(
        r"\*\s+\*\*(Parameters|Returns|Raises|Yields|Arguments|Throws):\*\*",
        r"\n **\1:**",
        content,
    )

    # Fix nested list items under Parameters/Returns etc:
    # a doubled "* * " marker becomes a single "- " item.
    content = re.sub(r"^\s*\*\s+\*\s+", " - ", content, flags=re.MULTILINE)

    # Remove orphaned list markers (a line holding nothing but "*").
    content = re.sub(r"^\s*\*\s*$", "", content, flags=re.MULTILINE)

    # Return-type lines: an indented "*type* –" on its own line becomes the
    # start of a list item "- *type*: " that the following line continues.
    content = re.sub(
        r"\n\s+\*(\w+)\*\s*[–-]\s*\n",
        r"\n - *\1*: ",
        content,
    )

    # Make "#### NOTE" / "#### WARNING" etc. more prominent. The lookbehind
    # and word boundary keep deeper headings ("##### NOTE") and longer words
    # ("#### NOTEBOOK") from being partially rewritten.
    content = re.sub(
        r"(?<!#)####\s+(NOTE|WARNING|SEE ALSO|IMPORTANT)\b",
        r"> **\1**",
        content,
    )

    return content
379+
380+
381+ def _is_api_page (filename : str , content : str ) -> bool :
382+ """Detect if a markdown file is an API documentation page.
383+
384+ Args:
385+ filename: Name of the markdown file.
386+ content: Content of the file.
387+
388+ Returns:
389+ True if this appears to be API documentation.
390+ """
391+ # Check filename patterns
392+ api_filename_patterns = [
393+ "api" ,
394+ "autoapi" ,
395+ "reference" ,
396+ ]
397+ filename_lower = filename .lower ()
398+ if any (pattern in filename_lower for pattern in api_filename_patterns ):
399+ return True
400+
401+ # Check content patterns that indicate API docs
402+ api_content_indicators = [
403+ "**Parameters:**" ,
404+ "* **Parameters:**" ,
405+ "**Returns:**" ,
406+ "* **Returns:**" ,
407+ "**Raises:**" ,
408+ "* **Raises:**" ,
409+ "**Arguments:**" ,
410+ "* **Arguments:**" ,
411+ ]
412+
413+ indicator_count = sum (1 for ind in api_content_indicators if ind in content )
414+ # If multiple API-style sections, treat as API docs
415+ return indicator_count >= 2
416+
417+
213418def extract_toctree_entries (content : str ) -> List [Tuple [str , str ]]:
214419 """Extract toctree entries from markdown content.
215420
@@ -553,6 +758,9 @@ def process_wiki_output(
553758 content = md_file .read_text (encoding = "utf-8" )
554759 # Clean up markdown formatting issues
555760 content = cleanup_markdown (content )
761+ # Clean up API documentation formatting if this looks like an API page
762+ if _is_api_page (md_file .name , content ):
763+ content = cleanup_api_docs (content )
556764 # Fix internal links for wiki
557765 fixed_content = fix_wiki_links (content , page_map )
558766 md_file .write_text (fixed_content , encoding = "utf-8" )
0 commit comments