diff --git a/apps/common/handle/impl/text/html_split_handle.py b/apps/common/handle/impl/text/html_split_handle.py
index 6ac8c44d9d9..e33c2833e3f 100644
--- a/apps/common/handle/impl/text/html_split_handle.py
+++ b/apps/common/handle/impl/text/html_split_handle.py
@@ -12,7 +12,7 @@
from bs4 import BeautifulSoup
from charset_normalizer import detect
-from html2text import html2text
+from markdownify import markdownify
from common.handle.base_split_handle import BaseSplitHandle
from common.utils.logger import maxkb_logger
@@ -44,6 +44,12 @@ def support(self, file, get_buffer):
return True
return False
+ def _remove_anchor_links(self, html: str) -> str:
+ soup = BeautifulSoup(html, 'html.parser')
+ for a in soup.find_all('a', href=re.compile('^#')):
+ a.unwrap()
+ return str(soup)
+
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
buffer = get_buffer(file)
if type(limit) is str:
@@ -57,7 +63,8 @@ def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_bu
try:
encoding = get_encoding(buffer)
content = buffer.decode(encoding)
- content = html2text(content)
+ content = self._remove_anchor_links(content)
+ content = markdownify(content, heading_style='ATX')
except BaseException as e:
maxkb_logger.error(f"Error processing HTML file {file.name}: {e}, {traceback.format_exc()}")
@@ -75,7 +82,8 @@ def get_content(self, file, save_image):
try:
encoding = get_encoding(buffer)
content = buffer.decode(encoding)
- return html2text(content)
+ content = self._remove_anchor_links(content)
+ return markdownify(content, heading_style='ATX')
except BaseException as e:
maxkb_logger.error(f'Exception: {e}', exc_info=True)
return f'{e}'
diff --git a/apps/common/utils/fork.py b/apps/common/utils/fork.py
index 9b172e8554f..8964dc25759 100644
--- a/apps/common/utils/fork.py
+++ b/apps/common/utils/fork.py
@@ -5,7 +5,7 @@
from typing import List, Set
from urllib.parse import urljoin, urlparse, ParseResult, urlsplit, urlunparse
-import html2text as ht
+from markdownify import markdownify
import requests
from bs4 import BeautifulSoup
@@ -138,6 +138,9 @@ def reset_beautiful_soup(self, bf: BeautifulSoup):
tag_list = bf.find_all(**{field: re.compile('^(?!(http:|https:|tel:/|#|mailto:|javascript:)).*')})
for tag in tag_list:
self.reset_url(tag, field, self.base_fork_url)
+        # Drop anchor-only links (href starting with '#'), keeping their text
+ for a in bf.find_all('a', href=re.compile('^#')):
+ a.unwrap()
return bf
@staticmethod
@@ -189,7 +192,8 @@ def fork(self):
bf = self.reset_beautiful_soup(bf)
link_list = self.get_child_link_list(bf)
content = self.get_content_html(bf)
- r = ht.html2text(content)
+
+ r = markdownify(content, heading_style='ATX')
return Fork.Response.success(r, link_list)
diff --git a/pyproject.toml b/pyproject.toml
index 23d10c97915..21a83e133b9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -49,7 +49,6 @@ dependencies = [
"django-celery-beat==2.8.1",
"celery-once==3.0.1",
"django-apscheduler==0.7.0",
- "html2text==2025.4.15",
"openpyxl==3.1.5",
"python-docx==1.2.0",
"xlrd==2.0.2",
@@ -63,7 +62,8 @@ dependencies = [
"websockets==15.0.1",
"ruff==0.15.12",
"cohere==5.17.0",
- "jsonpath-ng==1.8.0"
+ "jsonpath-ng==1.8.0",
+ "markdownify>=1.2.2",
]
[tool.uv]
@@ -88,4 +88,4 @@ torch = [
[build-system]
requires = ["hatchling"]
-build-backend = "hatchling.build"
\ No newline at end of file
+build-backend = "hatchling.build"