Files
03Rag/05html.py
T
heyong.fu a17c65c4bc feat: rag
2026-05-06 11:35:10 +08:00

32 lines
732 B
Python

# 读取html文件
from bs4 import BeautifulSoup
def extract_text_html(file_path):
    """Return the text content of an HTML file.

    The file is read in full as UTF-8, parsed with BeautifulSoup's
    built-in ``html.parser`` backend, and the text of the parsed
    document is returned with its pieces joined by newline characters.

    Args:
        file_path (str): Path to the HTML file to read.

    Returns:
        str: The extracted text, with a newline between text pieces.
    """
    # Load the whole document into memory first; the file handle is
    # closed before parsing begins.
    with open(file_path, encoding="utf-8") as source:
        markup = source.read()

    # Parse with the stdlib-backed parser and flatten to plain text,
    # using "\n" as the separator between adjacent text pieces.
    document = BeautifulSoup(markup, "html.parser")
    return document.get_text(separator="\n")
if __name__ == "__main__":
    # Script entry point: extract the text of the bundled sample page
    # and write it to standard output.
    sample_path = "example/example.html"
    print(extract_text_html(sample_path))