feat: rag
This commit is contained in:
@@ -0,0 +1,33 @@
|
||||
# 读取ppt文件
|
||||
from pptx import Presentation
|
||||
|
||||
# 定义函数,提取ppt文件中的所有文本内容
|
||||
|
||||
|
||||
def extract_ppt_text(file_path) -> str:
    """
    Extract all text from a PowerPoint file and return it as one string.

    Slides are visited in presentation order and, within a slide, shapes
    are visited in the order python-pptx yields them. Besides shapes that
    expose a ``text`` attribute, grouped shapes are searched recursively
    and table cells are read, because neither a group shape nor the
    graphic frame holding a table carries a ``text`` attribute itself.

    :param file_path: path of the PPT file, passed straight to ``Presentation``
    :return: every collected text fragment, joined with newline characters
    """
    # Load the presentation.
    ppt = Presentation(file_path)

    # One entry per text-bearing shape or table cell, in document order.
    text_list = []
    for slide in ppt.slides:
        text_list.extend(_iter_shape_texts(slide.shapes))

    return "\n".join(text_list)


def _iter_shape_texts(shapes):
    """Yield the text of each shape in *shapes*, descending into groups and tables."""
    for shape in shapes:
        if hasattr(shape, "text"):
            # Shape exposes text directly (same check the flat loop used).
            yield shape.text
        elif hasattr(shape, "shapes"):
            # Group shape: its members are only reachable through `.shapes`.
            yield from _iter_shape_texts(shape.shapes)
        elif getattr(shape, "has_table", False):
            # Table: the text lives in the individual cells, row by row.
            for row in shape.table.rows:
                for cell in row.cells:
                    yield cell.text
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: print the text extracted from the presentation
    # at the hard-coded sample path.
    sample_path = "example/example.pptx"
    print(extract_ppt_text(sample_path))
|
||||
Reference in New Issue
Block a user