refactor: Replace html2text with markdownify and strip in-page anchor links

zhanweizhang7 · zhanweizhang7 · commit 9b290cb88816 · 2026-04-27T16:03:23.000+08:00
diff --git a/apps/common/handle/impl/text/html_split_handle.py b/apps/common/handle/impl/text/html_split_handle.py
@@ -12,7 +12,7 @@
 
 from bs4 import BeautifulSoup
 from charset_normalizer import detect
-from html2text import html2text
+from markdownify import markdownify
 
 from common.handle.base_split_handle import BaseSplitHandle
 from common.utils.logger import maxkb_logger
@@ -44,6 +44,12 @@ def support(self, file, get_buffer):
             return True
         return False
 
+    def _remove_anchor_links(self, html: str) -> str:  # strip in-page anchor links from the HTML before it is converted to Markdown
+        soup = BeautifulSoup(html, 'html.parser')
+        for a in soup.find_all('a', href=re.compile('^#')):  # only <a> tags whose href starts with '#'
+            a.unwrap()  # remove the <a> tag itself but leave its contents (the link text) in place
+        return str(soup)  # hand back the modified document re-serialized as an HTML string
+
     def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
         buffer = get_buffer(file)
         if type(limit) is str:
@@ -57,7 +63,8 @@ def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_bu
         try:
             encoding = get_encoding(buffer)
             content = buffer.decode(encoding)
-            content = html2text(content)
+            content = self._remove_anchor_links(content)
+            content = markdownify(content, heading_style='ATX')
         except BaseException as e:
             maxkb_logger.error(f"Error processing HTML file {file.name}: {e}, {traceback.format_exc()}")
 
@@ -75,7 +82,8 @@ def get_content(self, file, save_image):
         try:
             encoding = get_encoding(buffer)
             content = buffer.decode(encoding)
-            return html2text(content)
+            content = self._remove_anchor_links(content)
+            return markdownify(content, heading_style='ATX')
         except BaseException as e:
             maxkb_logger.error(f'Exception: {e}', exc_info=True)
             return f'{e}'
diff --git a/apps/common/utils/fork.py b/apps/common/utils/fork.py
@@ -5,7 +5,7 @@
 from typing import List, Set
 from urllib.parse import urljoin, urlparse, ParseResult, urlsplit, urlunparse
 
-import html2text as ht
+from markdownify import markdownify
 import requests
 from bs4 import BeautifulSoup
 
@@ -138,6 +138,9 @@ def reset_beautiful_soup(self, bf: BeautifulSoup):
             tag_list = bf.find_all(**{field: re.compile('^(?!(http:|https:|tel:/|#|mailto:|javascript:)).*')})
             for tag in tag_list:
                 self.reset_url(tag, field, self.base_fork_url)
+        # Unwrap anchor links whose href starts with '#', keeping their text.
+        for a in bf.find_all('a', href=re.compile('^#')):
+            a.unwrap()
         return bf
 
     @staticmethod
@@ -189,7 +192,8 @@ def fork(self):
         bf = self.reset_beautiful_soup(bf)
         link_list = self.get_child_link_list(bf)
         content = self.get_content_html(bf)
-        r = ht.html2text(content)
+
+        r = markdownify(content, heading_style='ATX')
         return Fork.Response.success(r, link_list)
 
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -49,7 +49,6 @@ dependencies = [
     "django-celery-beat==2.8.1",
     "celery-once==3.0.1",
     "django-apscheduler==0.7.0",
-    "html2text==2025.4.15",
     "openpyxl==3.1.5",
     "python-docx==1.2.0",
     "xlrd==2.0.2",
@@ -63,7 +62,8 @@ dependencies = [
     "websockets==15.0.1",
     "ruff==0.15.12",
     "cohere==5.17.0",
-    "jsonpath-ng==1.8.0"
+    "jsonpath-ng==1.8.0",
+    "markdownify>=1.2.2",
 ]
 
 [tool.uv]
@@ -88,4 +88,4 @@ torch = [
 
 [build-system]
 requires = ["hatchling"]
-build-backend = "hatchling.build"
+build-backend = "hatchling.build"