feat: rag

2026-05-06 11:35:10 +08:00
commit a17c65c4bc
75 changed files with 5196 additions and 0 deletions
@@ -0,0 +1,31 @@
+# 读取html文件
+
+from bs4 import BeautifulSoup
+
+
+def extract_text_html(file_path):
+    """
+    从指定HTML文件中提取所有文本内容
+
+    参数:
+        file_path (str): HTML文件路径
+
+    返回:
+        str: 提取的文本内容
+
+    """
+
+    with open(file_path, "r", encoding="utf-8") as f:
+        # 读取整个html文件内容字符串
+        html = f.read()
+        # 使用BeautifulSoup解析html内容
+        soup = BeautifulSoup(html, "html.parser")
+        # 提取所有文本内容，使用换行符分割
+        text = soup.get_text(separator="\n")
+        return text
+
+
+if __name__ == "__main__":
+    file_path = "example/example.html"
+    result = extract_text_html(file_path)
+    print(result)