Skip to content

Commit 1919fed

Browse files
GitHub Copilot authored and brlin-tw committed
Add HTML to Markdown converter script
Added html_to_markdown_converter.py to convert GitHub Actions documentation sidebar HTML to GitHub Flavored Markdown format. The script uses BeautifulSoup4 to parse HTML structure and generates properly formatted unordered lists with preserved hierarchy and links.
1 parent 9955985 commit 1919fed

1 file changed

Lines changed: 127 additions & 0 deletions

File tree

html_to_markdown_converter.py

Lines changed: 127 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,127 @@
1+
#!/usr/bin/env python3
2+
"""
3+
HTML Sidebar to GitHub Flavored Markdown Converter
4+
==================================================
5+
6+
SPDX-License-Identifier: AGPL-3.0-or-later
7+
Copyright 2025 Buo-ren Lin <buo.ren.lin@gmail.com>
8+
"""
9+
10+
import re
11+
from bs4 import BeautifulSoup
12+
13+
14+
def extract_depth_from_style(style_attr):
    """Return the nesting depth encoded in an inline ``style`` attribute.

    The depth is read from the CSS custom property ``--subitem-depth``.

    Args:
        style_attr: The raw value of a ``style`` attribute, or a falsy
            value (``None`` / empty string) when the attribute is absent.

    Returns:
        The depth as an ``int``; ``0`` when the attribute is missing or
        does not declare ``--subitem-depth`` with a numeric value.
    """
    if not style_attr:
        return 0

    # CSS permits whitespace on either side of the colon, so accept
    # "--subitem-depth : 2" as well as "--subitem-depth:2".  The lookbehind
    # keeps a longer property name that merely ends in "--subitem-depth"
    # (e.g. "--x--subitem-depth") from being mistaken for this one.
    match = re.search(r'(?<![\w-])--subitem-depth\s*:\s*(\d+)', style_attr)
    return int(match.group(1)) if match else 0
24+
25+
26+
def extract_text_and_url_from_item(item):
    """Return the ``(text, url)`` pair for one sidebar entry.

    ``text`` is the whitespace-stripped content of the entry's label span,
    or ``"Unknown"`` when no such span is found.  ``url`` is the ``href``
    of the entry itself when it is an ``<a>`` element, otherwise the
    ``href`` of the first nested ``<a>``; it is ``None`` when there is no
    anchor.
    """
    label = item.find('span', class_='prc-ActionList-ItemLabel-TmBhn')
    if label:
        text = label.get_text(strip=True)
    else:
        text = "Unknown"

    # The entry may itself be the link, or merely contain one.
    link = item if item.name == 'a' else item.find('a')
    url = link.get('href') if link else None

    return text, url
43+
44+
45+
def process_li_element(li_element, depth=0):
    """Convert one sidebar ``<li>`` and its nested items to Markdown lines.

    Args:
        li_element: The parsed ``<li>`` element to convert.
        depth: Nesting level of this item; ``0`` for a top-level entry.

    Returns:
        A list of Markdown unordered-list lines: this item first (when it
        has a recognisable ``<a>``/``<button>`` content element), followed
        by the lines of its sub-group items, each one level deeper.
    """
    lines = []

    # The visible content of an item is either a link (<a>) or a
    # collapsible group header (<button>).
    content_element = li_element.find(
        ['a', 'button'], class_='prc-ActionList-ActionListContent-sg9-x'
    )

    if content_element:
        text, url = extract_text_and_url_from_item(content_element)

        # Two spaces per level: in CommonMark/GFM a list item only nests
        # under its parent when it is indented at least to the parent's
        # content column, and the "- " marker is two columns wide.  One
        # space per level would flatten the hierarchy when rendered.
        indent = "  " * depth
        if url:
            lines.append(f"{indent}- [{text}]({url})")
        else:
            lines.append(f"{indent}- {text}")

    # Recurse into the item's sub-group, if any.  Only direct <li> children
    # of the sub-group are taken here; deeper levels are handled by the
    # recursive calls.
    child_ul = li_element.find('ul', class_='prc-ActionList-SubGroup-24eK2')
    if child_ul:
        child_lis = child_ul.find_all(
            'li', class_='prc-ActionList-ActionListItem-uq6I7', recursive=False
        )
        for child_li in child_lis:
            lines.extend(process_li_element(child_li, depth + 1))

    return lines
74+
75+
76+
def convert_html_to_markdown(html_content):
    """Render the sidebar found in *html_content* as a Markdown document.

    The result is a level-1 heading, a blank line, and one list line per
    sidebar entry, joined with newlines.  When the sidebar's root ``<ul>``
    cannot be located, an ``"Error: ..."`` message string is returned
    instead of a document.
    """
    document = BeautifulSoup(html_content, 'html.parser')

    sidebar_root = document.find('ul', class_='prc-ActionList-ActionList-X4RiC')
    if sidebar_root is None:
        return "Error: Could not find main sidebar container"

    # Only direct children: nested entries are reached through recursion
    # inside process_li_element().
    top_items = sidebar_root.find_all(
        'li', class_='prc-ActionList-ActionListItem-uq6I7', recursive=False
    )

    output = ["# GitHub Actions Documentation", ""]
    for entry in top_items:
        output += process_li_element(entry, 0)

    return "\n".join(output)
96+
97+
98+
def main(
    input_file="/home/brlin/文件/TOC workspace/gh-actions-docs-sidebar.html",
    output_file="/home/brlin/文件/TOC workspace/gh-actions-docs-sidebar.md",
):
    """Read the sidebar HTML file, convert it, and write the Markdown file.

    Args:
        input_file: Path of the UTF-8 HTML file to read.
        output_file: Path of the UTF-8 Markdown file to write.

    Returns:
        None.  Progress and failures are reported on standard output; no
        exception is propagated to the caller.
    """
    # Phase 1: read the input.  Kept in its own ``try`` so that a missing
    # *output* directory later on is not misreported as a missing input file.
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            html_content = f.read()
    except FileNotFoundError:
        print(f"❌ Error: Could not find input file: {input_file}")
        return
    except Exception as e:
        print(f"❌ Error during conversion: {str(e)}")
        return

    # Phase 2: convert and write.
    try:
        markdown_content = convert_html_to_markdown(html_content)

        # The converter reports a missing sidebar container by returning an
        # "Error: ..." string rather than raising.  A real document always
        # starts with a "#" heading, so this prefix is unambiguous.  Do not
        # write the message to the output file or report success.
        if markdown_content.startswith("Error:"):
            print(f"❌ {markdown_content}")
            return

        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(markdown_content)
    except Exception as e:
        # Top-level boundary of the script: report any failure and stop.
        print(f"❌ Error during conversion: {str(e)}")
        return

    print("✅ Successfully converted HTML to Markdown!")
    print(f"📁 Input file: {input_file}")
    print(f"📁 Output file: {output_file}")
    print(f"📊 Generated {len(markdown_content.splitlines())} lines of Markdown")
124+
125+
126+
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)