From 9b290cb888167f19765441e0bc6314e25cf912ce Mon Sep 17 00:00:00 2001 From: zhangzhanwei Date: Mon, 27 Apr 2026 16:03:23 +0800 Subject: [PATCH] refactor: Remove html2text and import markdownify --- apps/common/handle/impl/text/html_split_handle.py | 14 +++++++++++--- apps/common/utils/fork.py | 8 ++++++-- pyproject.toml | 6 +++--- 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/apps/common/handle/impl/text/html_split_handle.py b/apps/common/handle/impl/text/html_split_handle.py index 6ac8c44d9d9..e33c2833e3f 100644 --- a/apps/common/handle/impl/text/html_split_handle.py +++ b/apps/common/handle/impl/text/html_split_handle.py @@ -12,7 +12,7 @@ from bs4 import BeautifulSoup from charset_normalizer import detect -from html2text import html2text +from markdownify import markdownify from common.handle.base_split_handle import BaseSplitHandle from common.utils.logger import maxkb_logger @@ -44,6 +44,12 @@ def support(self, file, get_buffer): return True return False + def _remove_anchor_links(self, html: str) -> str: + soup = BeautifulSoup(html, 'html.parser') + for a in soup.find_all('a', href=re.compile('^#')): + a.unwrap() + return str(soup) + def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image): buffer = get_buffer(file) if type(limit) is str: @@ -57,7 +63,8 @@ def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_bu try: encoding = get_encoding(buffer) content = buffer.decode(encoding) - content = html2text(content) + content = self._remove_anchor_links(content) + content = markdownify(content, heading_style='ATX') except BaseException as e: maxkb_logger.error(f"Error processing HTML file {file.name}: {e}, {traceback.format_exc()}") @@ -75,7 +82,8 @@ def get_content(self, file, save_image): try: encoding = get_encoding(buffer) content = buffer.decode(encoding) - return html2text(content) + content = self._remove_anchor_links(content) + return markdownify(content, heading_style='ATX') except BaseException as e: maxkb_logger.error(f'Exception: {e}', exc_info=True) return f'{e}' diff --git a/apps/common/utils/fork.py b/apps/common/utils/fork.py index 9b172e8554f..8964dc25759 100644 --- a/apps/common/utils/fork.py +++ b/apps/common/utils/fork.py @@ -5,7 +5,7 @@ from typing import List, Set from urllib.parse import urljoin, urlparse, ParseResult, urlsplit, urlunparse -import html2text as ht +from markdownify import markdownify import requests from bs4 import BeautifulSoup @@ -138,6 +138,9 @@ def reset_beautiful_soup(self, bf: BeautifulSoup): tag_list = bf.find_all(**{field: re.compile('^(?!(http:|https:|tel:/|#|mailto:|javascript:)).*')}) for tag in tag_list: self.reset_url(tag, field, self.base_fork_url) + # 去掉 href 以 # 开头的锚点链接,保留文字 + for a in bf.find_all('a', href=re.compile('^#')): + a.unwrap() return bf @staticmethod @@ -189,7 +192,8 @@ def fork(self): bf = self.reset_beautiful_soup(bf) link_list = self.get_child_link_list(bf) content = self.get_content_html(bf) - r = ht.html2text(content) + + r = markdownify(content, heading_style='ATX') return Fork.Response.success(r, link_list) diff --git a/pyproject.toml b/pyproject.toml index 23d10c97915..21a83e133b9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,7 +49,6 @@ dependencies = [ "django-celery-beat==2.8.1", "celery-once==3.0.1", "django-apscheduler==0.7.0", - "html2text==2025.4.15", "openpyxl==3.1.5", "python-docx==1.2.0", "xlrd==2.0.2", @@ -63,7 +62,8 @@ dependencies = [ "websockets==15.0.1", "ruff==0.15.12", "cohere==5.17.0", - "jsonpath-ng==1.8.0" + "jsonpath-ng==1.8.0", + "markdownify>=1.2.2", ] [tool.uv] @@ -88,4 +88,4 @@ torch = [ [build-system] requires = ["hatchling"] -build-backend = "hatchling.build" \ No newline at end of file +build-backend = "hatchling.build"