32 lines
732 B
Python
32 lines
732 B
Python
# Read an HTML file and extract its text content with BeautifulSoup.
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
def extract_text_html(file_path, *, encoding: str = "utf-8", separator: str = "\n") -> str:
    """Extract all text content from an HTML file.

    The file is read in full, parsed with BeautifulSoup's ``html.parser``
    backend, and the text fragments of the document are joined with
    ``separator``.

    Args:
        file_path (str): Path to the HTML file.
        encoding (str): Text encoding used to decode the file. Defaults to
            ``"utf-8"``, the value that used to be hard-coded.
        separator (str): String placed between adjacent text fragments.
            Defaults to a newline, the value that used to be hard-coded.

    Returns:
        str: The extracted text content.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file content is not valid in ``encoding``.
    """
    # Read the entire document into a string first so the file handle is
    # closed before parsing begins.
    with open(file_path, "r", encoding=encoding) as f:
        html = f.read()

    # Parse the markup with the standard-library-backed parser.
    soup = BeautifulSoup(html, "html.parser")

    # Collect every text fragment, joined by the requested separator.
    return soup.get_text(separator=separator)
|
|
|
|
|
|
if __name__ == "__main__":
    # Demo entry point: dump the text of the bundled example page to stdout.
    print(extract_text_html("example/example.html"))
|