Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 11 additions & 3 deletions apps/common/handle/impl/text/html_split_handle.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

from bs4 import BeautifulSoup
from charset_normalizer import detect
from html2text import html2text
from markdownify import markdownify

from common.handle.base_split_handle import BaseSplitHandle
from common.utils.logger import maxkb_logger
Expand Down Expand Up @@ -44,6 +44,12 @@ def support(self, file, get_buffer):
return True
return False

def _remove_anchor_links(self, html: str) -> str:
    """Drop in-page anchor links from *html* while keeping their contents.

    Every ``<a>`` tag whose ``href`` begins with ``#`` is unwrapped: the tag
    itself is removed and its children stay in place in the document.

    :param html: HTML markup to process.
    :return: the markup re-serialized after the matching tags are unwrapped.
    """
    document = BeautifulSoup(html, 'html.parser')
    fragment_href = re.compile('^#')
    for anchor in document.find_all('a', href=fragment_href):
        # unwrap() removes only the <a> tag; its text and child nodes remain.
        anchor.unwrap()
    return str(document)

def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
buffer = get_buffer(file)
if type(limit) is str:
Expand All @@ -57,7 +63,8 @@ def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_bu
try:
encoding = get_encoding(buffer)
content = buffer.decode(encoding)
content = html2text(content)
content = self._remove_anchor_links(content)
content = markdownify(content, heading_style='ATX')
except BaseException as e:
maxkb_logger.error(f"Error processing HTML file {file.name}: {e}, {traceback.format_exc()}")

Expand All @@ -75,7 +82,8 @@ def get_content(self, file, save_image):
try:
encoding = get_encoding(buffer)
content = buffer.decode(encoding)
return html2text(content)
content = self._remove_anchor_links(content)
return markdownify(content, heading_style='ATX')
except BaseException as e:
maxkb_logger.error(f'Exception: {e}', exc_info=True)
return f'{e}'
8 changes: 6 additions & 2 deletions apps/common/utils/fork.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from typing import List, Set
from urllib.parse import urljoin, urlparse, ParseResult, urlsplit, urlunparse

import html2text as ht
from markdownify import markdownify
import requests
from bs4 import BeautifulSoup

Expand Down Expand Up @@ -138,6 +138,9 @@ def reset_beautiful_soup(self, bf: BeautifulSoup):
tag_list = bf.find_all(**{field: re.compile('^(?!(http:|https:|tel:/|#|mailto:|javascript:)).*')})
for tag in tag_list:
self.reset_url(tag, field, self.base_fork_url)
# Remove anchor links whose href starts with '#', keeping their text
for a in bf.find_all('a', href=re.compile('^#')):
a.unwrap()
return bf

@staticmethod
Expand Down Expand Up @@ -189,7 +192,8 @@ def fork(self):
bf = self.reset_beautiful_soup(bf)
link_list = self.get_child_link_list(bf)
content = self.get_content_html(bf)
r = ht.html2text(content)

r = markdownify(content, heading_style='ATX')
return Fork.Response.success(r, link_list)


Expand Down
6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ dependencies = [
"django-celery-beat==2.8.1",
"celery-once==3.0.1",
"django-apscheduler==0.7.0",
"html2text==2025.4.15",
"openpyxl==3.1.5",
"python-docx==1.2.0",
"xlrd==2.0.2",
Expand All @@ -63,7 +62,8 @@ dependencies = [
"websockets==15.0.1",
"ruff==0.15.12",
"cohere==5.17.0",
"jsonpath-ng==1.8.0"
"jsonpath-ng==1.8.0",
"markdownify>=1.2.2",
]

[tool.uv]
Expand All @@ -88,4 +88,4 @@ torch = [

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
build-backend = "hatchling.build"
Loading