|
| 1 | +#!/usr/bin/env python3 |
| 2 | +""" |
| 3 | +HTML Sidebar to GitHub Flavored Markdown Converter |
| 4 | +================================================== |
| 5 | +
|
| 6 | +SPDX-License-Identifier: AGPL-3.0-or-later |
| 7 | +Copyright 2025 Buo-ren Lin <buo.ren.lin@gmail.com> |
| 8 | +""" |
| 9 | + |
| 10 | +import re |
| 11 | +from bs4 import BeautifulSoup |
| 12 | + |
| 13 | + |
def extract_depth_from_style(style_attr):
    """Extract the nesting depth from an inline ``style`` attribute.

    Looks for a ``--subitem-depth: N`` CSS custom-property declaration and
    returns ``N`` as an integer.

    Args:
        style_attr: Value of an element's ``style`` attribute, or a falsy
            value (``None`` / ``""``) when the attribute is absent.

    Returns:
        The depth found in the declaration, or ``0`` when the attribute is
        empty or carries no such declaration.
    """
    if not style_attr:
        return 0

    # The look-behind keeps a longer property name that merely ends in
    # "--subitem-depth" from matching; whitespace before the colon is legal
    # CSS, so it is tolerated as well as whitespace after it.
    match = re.search(r'(?<![\w-])--subitem-depth\s*:\s*(\d+)', style_attr)
    return int(match.group(1)) if match else 0
| 24 | + |
| 25 | + |
def extract_text_and_url_from_item(item):
    """Extract the text label and URL from a sidebar item.

    Args:
        item: The item's content element (an ``<a>``, a ``<button>`` or a
            wrapper around one), exposing the BeautifulSoup ``Tag`` API.

    Returns:
        A ``(text, url)`` tuple. ``text`` is the label with surrounding
        whitespace stripped, or ``"Unknown"`` when no text can be found.
        ``url`` is the ``href`` of the item itself when it is an ``<a>``,
        otherwise the ``href`` of the first nested ``<a>``, or ``None``
        when there is no link.
    """
    label_span = item.find('span', class_='prc-ActionList-ItemLabel-TmBhn')

    # The label class name is a hashed build artefact and may change; fall
    # back to the element's own text rather than emitting a placeholder for
    # an item that visibly has a label.
    text_source = label_span if label_span is not None else item
    text = text_source.get_text(strip=True) or "Unknown"

    # The item is either the link itself or a container holding one.
    link = item if item.name == 'a' else item.find('a')
    url = link.get('href') if link is not None else None

    return text, url
| 43 | + |
| 44 | + |
def process_li_element(li_element, depth=0):
    """Render one sidebar ``<li>`` and its nested items as Markdown list lines.

    Args:
        li_element: The ``<li>`` element to render (BeautifulSoup ``Tag``
            API).
        depth: Nesting level of this item; every level indents the bullet
            by two spaces.

    Returns:
        A list of Markdown lines: this item's bullet (when the item has a
        content element) followed by the lines of all nested items.
    """
    lines = []

    # The item's own content is the first <a>/<button> carrying the content
    # class.
    content_element = li_element.find(
        ['a', 'button'], class_='prc-ActionList-ActionListContent-sg9-x'
    )
    if content_element is not None:
        text, url = extract_text_and_url_from_item(content_element)

        # Backslash-escape the characters that would end the link text early
        # or turn a plain "- [ ] ..." bullet into a GFM task-list item.
        label = re.sub(r'([\\\[\]])', r'\\\1', text)

        # A child bullet must be indented at least to the parent's content
        # column (two columns for "- ") to be parsed as a nested list.
        indent = "  " * depth
        if url:
            # Spaces and parentheses are not safe in a bare link
            # destination; percent-encode them so the link stays intact.
            target = (
                url.replace(' ', '%20').replace('(', '%28').replace(')', '%29')
            )
            lines.append(f"{indent}- [{label}]({target})")
        else:
            lines.append(f"{indent}- {label}")

    # Nested items live in a sub-group <ul> inside this <li>; only that
    # list's direct <li> children belong to the next level.
    child_ul = li_element.find('ul', class_='prc-ActionList-SubGroup-24eK2')
    if child_ul is not None:
        child_lis = child_ul.find_all(
            'li', class_='prc-ActionList-ActionListItem-uq6I7', recursive=False
        )
        for child_li in child_lis:
            lines.extend(process_li_element(child_li, depth + 1))

    return lines
| 74 | + |
| 75 | + |
def convert_html_to_markdown(html_content, title="GitHub Actions Documentation"):
    """Convert the HTML sidebar to Markdown format.

    Args:
        html_content: HTML text containing the sidebar's main ``<ul>``.
        title: Text of the level-1 heading placed at the top of the output.

    Returns:
        The Markdown document as a single string: the heading, a blank
        line, then one bullet line per sidebar item.

    Raises:
        ValueError: If the main sidebar container cannot be found. Raising
            (instead of returning an error sentence) keeps callers from
            writing the error text out as if it were converted Markdown.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    main_ul = soup.find('ul', class_='prc-ActionList-ActionList-X4RiC')
    if main_ul is None:
        raise ValueError("Could not find main sidebar container")

    lines = [f"# {title}", ""]

    # Only direct children are top-level entries; nested items are reached
    # through process_li_element's own recursion.
    top_level_lis = main_ul.find_all(
        'li', class_='prc-ActionList-ActionListItem-uq6I7', recursive=False
    )
    for li in top_level_lis:
        lines.extend(process_li_element(li, 0))

    return "\n".join(lines)
| 96 | + |
| 97 | + |
def main(
    input_file="/home/brlin/文件/TOC workspace/gh-actions-docs-sidebar.html",
    output_file="/home/brlin/文件/TOC workspace/gh-actions-docs-sidebar.md",
):
    """Read the sidebar HTML file, convert it and write the Markdown file.

    Args:
        input_file: Path of the saved sidebar HTML to read (UTF-8).
        output_file: Path of the Markdown file to write (UTF-8); it is
            overwritten if it already exists.

    Returns:
        ``0`` on success, ``1`` when reading, converting or writing failed.
        Failures are reported on standard error.
    """
    import sys  # local import: only needed to report errors on stderr

    # Reading is handled separately so that a missing *input* file is the
    # only thing reported as such; a missing output directory also raises
    # FileNotFoundError and must not be blamed on the input.
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            html_content = f.read()
    except FileNotFoundError:
        print(f"❌ Error: Could not find input file: {input_file}", file=sys.stderr)
        return 1
    except (OSError, UnicodeError) as e:
        print(f"❌ Error during conversion: {e}", file=sys.stderr)
        return 1

    try:
        markdown_content = convert_html_to_markdown(html_content)

        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(markdown_content)
    except Exception as e:  # top-level boundary: report and signal failure
        print(f"❌ Error during conversion: {e}", file=sys.stderr)
        return 1

    print("✅ Successfully converted HTML to Markdown!")
    print(f"📁 Input file: {input_file}")
    print(f"📁 Output file: {output_file}")
    print(f"📊 Generated {len(markdown_content.splitlines())} lines of Markdown")
    return 0
| 124 | + |
| 125 | + |
# Run the conversion only when executed as a script, and hand main()'s
# result to the interpreter as the exit status (None counts as success).
if __name__ == "__main__":
    raise SystemExit(main())
0 commit comments