feat: rag
This commit is contained in:
@@ -0,0 +1,31 @@
|
||||
# Read an HTML file and extract its text content
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
def extract_text_html(file_path):
    """Extract all text content from the given HTML file.

    Args:
        file_path (str): Path to the HTML file. The file is opened in text
            mode and decoded as UTF-8.

    Returns:
        str: The extracted text, with the individual text fragments
        separated by newline characters.
    """
    # Load the whole document into memory as a single string.
    with open(file_path, "r", encoding="utf-8") as html_file:
        markup = html_file.read()

    # Parse the markup with the "html.parser" backend, then collect every
    # piece of text, using "\n" as the separator between fragments.
    return BeautifulSoup(markup, "html.parser").get_text(separator="\n")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Demo run: extract the text of the bundled example page and print it.
    print(extract_text_html("example/example.html"))
|
||||
Reference in New Issue
Block a user