feat: rag

This commit is contained in:
heyong.fu
2026-05-06 11:35:10 +08:00
commit a17c65c4bc
75 changed files with 5196 additions and 0 deletions
+31
View File
@@ -0,0 +1,31 @@
# Read an HTML file and extract its text
from bs4 import BeautifulSoup
def extract_text_html(file_path):
    """Extract all text content from the given HTML file.

    Args:
        file_path (str): Path to the HTML file; it is opened as UTF-8 text.

    Returns:
        str: The text of the parsed document, with the individual text
            pieces separated by newline characters.
    """
    # Load the whole document into memory; the handle is closed on exit.
    with open(file_path, mode="r", encoding="utf-8") as html_file:
        markup = html_file.read()

    # Parse with the "html.parser" backend and join the text pieces with "\n".
    return BeautifulSoup(markup, "html.parser").get_text(separator="\n")
if __name__ == "__main__":
    # Script entry point: extract the text of the HTML file at the
    # hard-coded example path and print it to standard output.
    sample_path = "example/example.html"
    print(extract_text_html(sample_path))