feat: rag
This commit is contained in:
@@ -0,0 +1,30 @@
|
||||
# Read an XML file and extract its text content
|
||||
|
||||
from lxml import etree
|
||||
|
||||
|
||||
def extract_xml_text(file_path):
    """Read an XML file and return all of its text content.

    Args:
        file_path (str): Path to the XML file.

    Returns:
        str: Every text fragment yielded by walking the parsed tree,
            joined with single spaces.

    Raises:
        OSError: If the file cannot be opened or read.

    Parse errors raised by ``etree.fromstring`` are not caught here and
    propagate to the caller.
    """
    # Read raw bytes instead of decoding as UTF-8 and re-encoding: the
    # parser can then pick the encoding from the BOM / XML declaration
    # itself, so documents that are not UTF-8 no longer fail on read.
    with open(file_path, "rb") as f:
        xml = f.read()
    # Parse the serialized document into an element tree.
    root = etree.fromstring(xml)
    # Walk the tree and join the collected text fragments with spaces.
    return " ".join(root.itertext())
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke run: extract the text of the bundled sample document
    # and write it to stdout.
    print(extract_xml_text("example/example.xml"))
|
||||
Reference in New Issue
Block a user