|
5 | 5 | from typing import Optional |
6 | 6 | from bs4 import BeautifulSoup |
7 | 7 |
|
# Type alias for image style rules: each rule dict may carry "filename",
# "alt" and "style" keys (consumed by process_images).
ImgRules = list[dict]
8 | 10 |
|
9 | 11 | def encode_md_link(path_str: str) -> str: |
10 | 12 | """ |
@@ -72,49 +74,56 @@ def process_gfm(content: str) -> str: |
72 | 74 | return content |
73 | 75 |
|
74 | 76 |
|
75 | | -# def remove_mainpage_title(content: str, filename: str) -> str: |
76 | | -# if filename != "index.html": |
77 | | -# return content |
def process_heading_code_blocks(content: str) -> str:
    """Normalize legacy ``<tt>`` markup to ``<code>`` tags.

    Args:
        content: HTML text to transform.

    Returns:
        The HTML with every opening/closing ``tt`` tag rewritten to ``code``.
    """
    for old_tag, new_tag in (("<tt>", "<code>"), ("</tt>", "</code>")):
        content = content.replace(old_tag, new_tag)
    return content
| 81 | + |
78 | 82 |
|
79 | | -# soup = BeautifulSoup(content, 'html.parser') |
80 | | -# header_div = soup.find("div", class_="header") |
81 | | -# if header_div: |
82 | | -# header_div.decompose() |
83 | | -# return str(soup) |
def remove_mainpage_title(content: str, filename: str) -> str:
    """Strip the Doxygen header/title block from the main page only.

    Args:
        content: HTML text of the page.
        filename: Basename of the page; only ``index.html`` is modified.

    Returns:
        ``content`` with the ``header``/``headertitle`` divs removed when the
        page is the main page, otherwise unchanged.
    """
    if filename == "index.html":
        title_pattern = re.compile(
            r'<div class="header">\s*<div class="headertitle">.*?</div>\s*</div>',
            re.DOTALL,
        )
        content = title_pattern.sub('', content)
    return content
85 | 88 |
|
86 | | -# ImgRules = list[dict] |
87 | 89 |
|
88 | | -# def process_images(content: str, html_path: Path, rules: ImgRules) -> str: |
89 | | -# soup = BeautifulSoup(content, 'html.parser') |
def process_images(content: str, html_path: Path, rules: list[dict]) -> str:
    """Rewrite ``<img>`` tags: flatten local src paths and apply style rules.

    Args:
        content: HTML text to transform.
        html_path: Path of the HTML file the content came from; images are
            considered local when a file with the same name exists next to it.
        rules: Image rules; each dict may carry "filename", "alt" and "style"
            keys. A rule matches when its filename equals the image's
            filename, or when its alt text appears in the tag.

    Returns:
        The transformed HTML string.
    """
    def replacer(match: "re.Match[str]") -> str:
        img_tag = match.group(0)
        src = match.group(1)
        filename = Path(src).name

        # Leave external/embedded images and missing local files untouched.
        if src.startswith(('http', 'data:')) or not (html_path.parent / filename).exists():
            return img_tag

        # Flatten the src to the bare filename (the image sits next to the
        # HTML file). BUG FIX: previously this substituted a literal
        # placeholder string instead of the actual filename.
        img_tag = img_tag.replace(f'src="{src}"', f'src="{filename}"')

        # Apply style rules.
        for rule in rules:
            # Guard the alt comparison: without the "alt" in rule check,
            # rule.get("alt") yields None and f'alt="None"' could falsely
            # match tags that literally contain alt="None".
            matched = (
                rule.get("filename") == filename
                or ("alt" in rule and f'alt="{rule["alt"]}"' in img_tag)
            )
            if not matched:
                continue

            style = rule.get("style", "").strip('; ')

            if 'style="' in img_tag:  # Append to the existing style attribute
                img_tag = re.sub(
                    r'style="([^"]*)"',
                    lambda m: f'style="{m.group(1).strip("; ")}; {style}"',
                    img_tag,
                )
            else:  # Inject a new style attribute before the tag close
                img_tag = img_tag.replace('/>', f' style="{style}" />').replace('">', f' style="{style}">')

        return img_tag

    # Match <img> tags and capture their src attribute.
    return re.sub(r'<img[^>]*src="([^"]+)"[^>]*>', replacer, content, flags=re.IGNORECASE)
109 | 117 |
|
110 | 118 |
|
def process_file(f: Path, img_rules: Optional[ImgRules] = None):
    """Run every HTML post-processing step over *f*, rewriting it in place.

    Args:
        f: Path to the HTML file to transform.
        img_rules: Optional image style rules forwarded to process_images;
            image processing is skipped when empty or None.
    """
    text = f.read_text(encoding='utf-8')

    # Content-only transforms, applied in order.
    for transform in (process_md_refs, process_gfm, process_heading_code_blocks):
        text = transform(text)

    # Transforms that also need file context.
    text = remove_mainpage_title(text, f.name)
    if img_rules:
        text = process_images(text, f, img_rules)

    f.write_text(text, encoding='utf-8')
120 | 129 |
|
|
0 commit comments