feat: rag

This commit is contained in:
heyong.fu
2026-05-06 11:35:10 +08:00
commit a17c65c4bc
75 changed files with 5196 additions and 0 deletions
+30
View File
@@ -0,0 +1,30 @@
# Read an XML file and extract its text content.
from lxml import etree
def extract_xml_text(file_path: str) -> str:
    """
    Read an XML file and return all of its text content as one string.

    The file is read as raw bytes and handed to the parser undecoded, so the
    parser determines the character encoding itself (e.g. from the XML
    declaration) instead of this function assuming the file is UTF-8.

    Args:
        file_path (str): Path to the XML file.

    Returns:
        str: Every text fragment yielded by ``root.itertext()``, joined
        with single spaces.

    Raises:
        OSError: If the file cannot be opened or read.
        lxml.etree.XMLSyntaxError: If the content is not well-formed XML.
    """
    # Binary mode on purpose: decoding as UTF-8 here and re-encoding for the
    # parser would raise UnicodeDecodeError on files in any other encoding.
    with open(file_path, "rb") as f:
        xml_bytes = f.read()
    # Parse the raw bytes into an XML tree and keep its root element.
    root = etree.fromstring(xml_bytes)
    # Walk the tree and join all text fragments with a single space.
    return " ".join(root.itertext())
if __name__ == "__main__":
    # Script entry point: extract the text of the bundled example document
    # and print it to stdout.
    print(extract_xml_text("example/example.xml"))