# 03Rag/05html.py

# Read an HTML file

from bs4 import BeautifulSoup


def extract_text_html(file_path: str, *, encoding: str = "utf-8") -> str:
    """Extract all text content from an HTML file.

    Args:
        file_path: Path of the HTML file to read.
        encoding: Text encoding used to decode the file. Defaults to
            "utf-8", which was previously hard-coded, so existing callers
            see no change.

    Returns:
        The text returned by ``BeautifulSoup.get_text`` for the parsed
        document, with text fragments separated by newline characters.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file content cannot be decoded with
            ``encoding``.
    """
    # Read the whole document first so the file handle is released before
    # the parsing step runs.
    with open(file_path, "r", encoding=encoding) as f:
        html = f.read()

    # Parse the markup with the "html.parser" backend.
    soup = BeautifulSoup(html, "html.parser")

    # Collect every text fragment, using a newline as the separator.
    return soup.get_text(separator="\n")


if __name__ == "__main__":
    # Demo entry point: extract the text of the bundled sample page and
    # write it to standard output.
    sample_path = "example/example.html"
    print(extract_text_html(sample_path))