55from typing import Optional
66from bs4 import BeautifulSoup
77
# Image styling rules for process_images(); presumably each rule dict carries
# optional "filename", "alt" and "style" keys — TODO confirm against callers.
ImgRules = list[dict]
9+
810
911def encode_md_link (path_str : str ) -> str :
1012 """
@@ -24,6 +26,10 @@ def encode_md_link(path_str: str) -> str:
2426 filepath = filepath .removeprefix ('/' )
2527 if not filepath .endswith ('.md' ):
2628 return path_str # Not a .md file
29+
30+ if filepath == "README.md" :
31+ return f"index.html{ anchor } "
32+
2733 filepath = filepath .removesuffix ('.md' )
2834
2935 # Encode path separators and underscores
@@ -72,49 +78,56 @@ def process_gfm(content: str) -> str:
7278 return content
7379
7480
75- # def remove_mainpage_title(content: str, filename: str) -> str:
76- # if filename != "index.html":
77- # return content
def process_heading_code_blocks(content: str) -> str:
    """Normalize legacy ``<tt>`` markup to semantic ``<code>`` tags.

    Doxygen emits inline code as <tt>…</tt>; modern HTML (and most CSS
    themes) expect <code>…</code> instead.
    """
    for legacy, modern in (("<tt>", "<code>"), ("</tt>", "</code>")):
        content = content.replace(legacy, modern)
    return content
7885
79- # soup = BeautifulSoup(content, 'html.parser')
80- # header_div = soup.find("div", class_="header")
81- # if header_div:
82- # header_div.decompose()
83- # return str(soup)
8486
def remove_mainpage_title(content: str, filename: str) -> str:
    """Strip the redundant Doxygen header/title banner from the main page.

    Only ``index.html`` is touched; every other file is returned unchanged.
    The header is matched as a literal markup pattern rather than parsed,
    so the removal is a single regex substitution.
    """
    if filename == "index.html":
        content = re.sub(r'<div class="header">\s*<div class="headertitle">.*?</div>\s*</div>', '', content, flags=re.DOTALL)
    return content
92+
93+
def process_images(content: str, html_path: Path, rules: list[dict]) -> str:
    """Rewrite ``<img>`` tags: flatten src paths and inject style rules.

    For every local image that exists next to ``html_path``, the ``src``
    attribute is reduced to the bare filename (the build copies images
    alongside the page).  Each rule dict may select an image by
    ``"filename"`` or by ``"alt"`` text and supplies a ``"style"`` CSS
    fragment to merge into the tag.

    :param content:   HTML document text.
    :param html_path: Path of the HTML file being processed; its parent
                      directory is where local images are expected.
    :param rules:     Style rules (see above).
    :returns: The rewritten HTML text.
    """
    def replacer(match):
        img_tag = match.group(0)
        src = match.group(1)
        filename = Path(src).name

        # Skip external URLs / data URIs, and images not present next to the page
        if src.startswith(('http', 'data:')) or not (html_path.parent / filename).exists():
            return img_tag

        # Fix the src path: the image sits beside the page, so a bare name is correct
        img_tag = img_tag.replace(f'src="{src}"', f'src="{filename}"')

        # Apply style rules
        for rule in rules:
            # BUGFIX: only evaluate the alt selector when the rule actually
            # defines one.  rule.get("alt") stringifies None to "None", which
            # used to make filename-only rules match any image whose literal
            # alt text was "None".
            by_name = rule.get("filename") == filename
            by_alt = "alt" in rule and f'alt="{rule["alt"]}"' in img_tag
            if by_name or by_alt:
                style = rule.get("style", "").strip('; ')

                if 'style="' in img_tag:  # Append to existing style
                    img_tag = re.sub(r'style="([^"]*)"', lambda m: f'style="{m.group(1).strip("; ")}; {style}"', img_tag)
                else:  # Inject new style
                    img_tag = img_tag.replace('/>', f' style="{style}" />').replace('">', f' style="{style}">')

        return img_tag

    # Match <img> tags and capture their src attribute
    return re.sub(r'<img[^>]*src="([^"]+)"[^>]*>', replacer, content, flags=re.IGNORECASE)
109121
110122
def process_file(f: Path, img_rules: Optional[ImgRules] = None):
    """Run the full post-processing pipeline on one HTML file, in place.

    Reads *f* as UTF-8, applies each content transform in order, then
    writes the result back to the same path.
    """
    text = f.read_text(encoding='utf-8')

    # Content-only passes, applied in sequence.
    for transform in (process_md_refs, process_gfm, process_heading_code_blocks):
        text = transform(text)

    # Passes that also need file identity / location.
    text = remove_mainpage_title(text, f.name)
    if img_rules:
        text = process_images(text, f, img_rules)

    f.write_text(text, encoding='utf-8')
120133
0 commit comments