Skip to content

Commit 9b290cb

Browse files
committed
refactor: Remove html2text and import markdownify
1 parent ed8badd commit 9b290cb

3 files changed

Lines changed: 20 additions & 8 deletions

File tree

apps/common/handle/impl/text/html_split_handle.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
from bs4 import BeautifulSoup
1414
from charset_normalizer import detect
15-
from html2text import html2text
15+
from markdownify import markdownify
1616

1717
from common.handle.base_split_handle import BaseSplitHandle
1818
from common.utils.logger import maxkb_logger
@@ -44,6 +44,12 @@ def support(self, file, get_buffer):
4444
return True
4545
return False
4646

47+
def _remove_anchor_links(self, html: str) -> str:
48+
soup = BeautifulSoup(html, 'html.parser')
49+
for a in soup.find_all('a', href=re.compile('^#')):
50+
a.unwrap()
51+
return str(soup)
52+
4753
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
4854
buffer = get_buffer(file)
4955
if type(limit) is str:
@@ -57,7 +63,8 @@ def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_bu
5763
try:
5864
encoding = get_encoding(buffer)
5965
content = buffer.decode(encoding)
60-
content = html2text(content)
66+
content = self._remove_anchor_links(content)
67+
content = markdownify(content, heading_style='ATX')
6168
except BaseException as e:
6269
maxkb_logger.error(f"Error processing HTML file {file.name}: {e}, {traceback.format_exc()}")
6370

@@ -75,7 +82,8 @@ def get_content(self, file, save_image):
7582
try:
7683
encoding = get_encoding(buffer)
7784
content = buffer.decode(encoding)
78-
return html2text(content)
85+
content = self._remove_anchor_links(content)
86+
return markdownify(content, heading_style='ATX')
7987
except BaseException as e:
8088
maxkb_logger.error(f'Exception: {e}', exc_info=True)
8189
return f'{e}'

apps/common/utils/fork.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from typing import List, Set
66
from urllib.parse import urljoin, urlparse, ParseResult, urlsplit, urlunparse
77

8-
import html2text as ht
8+
from markdownify import markdownify
99
import requests
1010
from bs4 import BeautifulSoup
1111

@@ -138,6 +138,9 @@ def reset_beautiful_soup(self, bf: BeautifulSoup):
138138
tag_list = bf.find_all(**{field: re.compile('^(?!(http:|https:|tel:/|#|mailto:|javascript:)).*')})
139139
for tag in tag_list:
140140
self.reset_url(tag, field, self.base_fork_url)
141+
# Remove anchor links whose href starts with '#', keeping their text
142+
for a in bf.find_all('a', href=re.compile('^#')):
143+
a.unwrap()
141144
return bf
142145

143146
@staticmethod
@@ -189,7 +192,8 @@ def fork(self):
189192
bf = self.reset_beautiful_soup(bf)
190193
link_list = self.get_child_link_list(bf)
191194
content = self.get_content_html(bf)
192-
r = ht.html2text(content)
195+
196+
r = markdownify(content, heading_style='ATX')
193197
return Fork.Response.success(r, link_list)
194198

195199

pyproject.toml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,6 @@ dependencies = [
4949
"django-celery-beat==2.8.1",
5050
"celery-once==3.0.1",
5151
"django-apscheduler==0.7.0",
52-
"html2text==2025.4.15",
5352
"openpyxl==3.1.5",
5453
"python-docx==1.2.0",
5554
"xlrd==2.0.2",
@@ -63,7 +62,8 @@ dependencies = [
6362
"websockets==15.0.1",
6463
"ruff==0.15.12",
6564
"cohere==5.17.0",
66-
"jsonpath-ng==1.8.0"
65+
"jsonpath-ng==1.8.0",
66+
"markdownify>=1.2.2",
6767
]
6868

6969
[tool.uv]
@@ -88,4 +88,4 @@ torch = [
8888

8989
[build-system]
9090
requires = ["hatchling"]
91-
build-backend = "hatchling.build"
91+
build-backend = "hatchling.build"

0 commit comments

Comments
 (0)