Add HTML to Markdown converter script

GitHub Copilot · brlin-tw · commit 1919fed025f5 · 2025-10-13T01:18:57.000+08:00
Added html_to_markdown_converter.py to convert GitHub Actions documentation sidebar HTML to GitHub Flavored Markdown format. The script uses BeautifulSoup4 to parse HTML structure and generates properly formatted unordered lists with preserved hierarchy and links.
diff --git a/html_to_markdown_converter.py b/html_to_markdown_converter.py
@@ -0,0 +1,127 @@
+#!/usr/bin/env python3
+"""
+HTML Sidebar to GitHub Flavored Markdown Converter
+==================================================
+
+SPDX-License-Identifier: AGPL-3.0-or-later
+Copyright 2025 Buo-ren Lin <buo.ren.lin@gmail.com>
+"""
+
+import re
+from bs4 import BeautifulSoup
+
+
+def extract_depth_from_style(style_attr):
+    """Return the nesting depth encoded in an inline ``style`` attribute.
+
+    Args:
+        style_attr: Inline CSS text, or a falsy value (``None``, ``""``)
+            when the attribute is absent.
+
+    Returns:
+        The integer that follows ``--subitem-depth:`` in the CSS text, or 0
+        when the value is falsy or the custom property is not present.
+    """
+    # Only run the regex when there is text to search; a missing attribute
+    # is treated the same way as "no depth declared".
+    found = re.search(r'--subitem-depth:\s*(\d+)', style_attr) if style_attr else None
+    return int(found.group(1)) if found else 0
+
+
+def extract_text_and_url_from_item(item):
+    """Return the ``(label, url)`` pair for one sidebar content element.
+
+    Args:
+        item: The parsed element holding the entry's content (the caller
+            passes an ``<a>`` or ``<button>`` element).
+
+    Returns:
+        A tuple ``(text, url)``. ``text`` is the whitespace-stripped content
+        of the label ``<span>``, or ``"Unknown"`` when no such span exists.
+        ``url`` is the ``href`` of the element itself when it is an anchor,
+        otherwise the ``href`` of the first nested anchor, or ``None`` when
+        there is no anchor.
+    """
+    label = item.find('span', class_='prc-ActionList-ItemLabel-TmBhn')
+    if label:
+        text = label.get_text(strip=True)
+    else:
+        text = "Unknown"
+
+    # The element is itself the link: its own href is the target.
+    if item.name == 'a':
+        return text, item.get('href')
+
+    # Otherwise fall back to the first anchor nested inside the element.
+    nested_link = item.find('a')
+    return text, (nested_link.get('href') if nested_link else None)
+
+
+def process_li_element(li_element, depth=0):
+    """Render one sidebar ``<li>`` and its nested items as Markdown lines.
+
+    Args:
+        li_element: The parsed ``<li>`` element to render.
+        depth: Nesting level of this item; each level adds two spaces of
+            indentation before the bullet.
+
+    Returns:
+        A list of Markdown list lines: this item first, followed by the
+        lines of its sub-items. The list is empty when the ``<li>`` has no
+        ``<a>``/``<button>`` content element.
+    """
+    content = li_element.find(
+        ['a', 'button'], class_='prc-ActionList-ActionListContent-sg9-x'
+    )
+    # Without a content element there is nothing to render, and nested
+    # items are skipped as well.
+    if not content:
+        return []
+
+    text, url = extract_text_and_url_from_item(content)
+    bullet = f"[{text}]({url})" if url else f"{text}"
+    rendered = ["  " * depth + f"- {bullet}"]
+
+    # Only the direct <li> children of the first nested sub-group are
+    # visited here; deeper levels are handled by the recursive call.
+    subgroup = li_element.find('ul', class_='prc-ActionList-SubGroup-24eK2')
+    if subgroup:
+        for nested_li in subgroup.find_all(
+            'li', class_='prc-ActionList-ActionListItem-uq6I7', recursive=False
+        ):
+            rendered.extend(process_li_element(nested_li, depth + 1))
+
+    return rendered
+
+
+def convert_html_to_markdown(html_content):
+    """Convert sidebar HTML into a Markdown document with a nested list.
+
+    Args:
+        html_content: The sidebar HTML as a string.
+
+    Returns:
+        The Markdown text: a ``# GitHub Actions Documentation`` heading, a
+        blank line, then one list line per sidebar item, joined with
+        newlines. When the main ``<ul>`` container is not found, the string
+        ``"Error: Could not find main sidebar container"`` is returned
+        instead.
+    """
+    document = BeautifulSoup(html_content, 'html.parser')
+
+    sidebar = document.find('ul', class_='prc-ActionList-ActionList-X4RiC')
+    if not sidebar:
+        return "Error: Could not find main sidebar container"
+
+    output = ["# GitHub Actions Documentation", ""]
+
+    # recursive=False keeps this loop to top-level entries; nested entries
+    # are emitted by process_li_element itself.
+    for entry in sidebar.find_all(
+        'li', class_='prc-ActionList-ActionListItem-uq6I7', recursive=False
+    ):
+        output.extend(process_li_element(entry, 0))
+
+    return "\n".join(output)
+
+
+def main(
+    input_file="/home/brlin/文件/TOC workspace/gh-actions-docs-sidebar.html",
+    output_file="/home/brlin/文件/TOC workspace/gh-actions-docs-sidebar.md",
+):
+    """Read the sidebar HTML file, convert it, and write the Markdown file.
+
+    Args:
+        input_file: Path of the UTF-8 HTML file to read. Defaults to the
+            path the script was originally written for.
+        output_file: Path of the UTF-8 Markdown file to write. Defaults to
+            the path the script was originally written for.
+
+    Returns:
+        None. Progress and errors are reported on standard output; no
+        exception propagates to the caller.
+    """
+    try:
+        # Read the HTML file
+        with open(input_file, 'r', encoding='utf-8') as f:
+            html_content = f.read()
+
+        # Convert to markdown
+        markdown_content = convert_html_to_markdown(html_content)
+
+        # convert_html_to_markdown() reports a missing sidebar container by
+        # returning this message instead of Markdown. Treat it as a failure
+        # rather than writing it to the output file and claiming success.
+        if markdown_content == "Error: Could not find main sidebar container":
+            print(f"❌ {markdown_content}")
+            return
+
+        # Write to output file
+        with open(output_file, 'w', encoding='utf-8') as f:
+            f.write(markdown_content)
+
+        print("✅ Successfully converted HTML to Markdown!")
+        print(f"📁 Input file: {input_file}")
+        print(f"📁 Output file: {output_file}")
+        print(f"📊 Generated {len(markdown_content.splitlines())} lines of Markdown")
+
+    except FileNotFoundError:
+        print(f"❌ Error: Could not find input file: {input_file}")
+    except Exception as e:
+        # Top-level boundary of the script: report any other failure
+        # instead of showing a traceback.
+        print(f"❌ Error during conversion: {e}")
+
+
+# Run the conversion only when this file is executed as a script, so that
+# importing the module has no side effects.
+if __name__ == "__main__":
+    main()