feat: rag
This commit is contained in:
@@ -0,0 +1,27 @@
|
||||
# 读取pdf文件
|
||||
import fitz
|
||||
|
||||
|
||||
def extract_pfd_text(pdf_path: str) -> str:
    """Extract the plain text of every page in a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages, in page order, joined with a newline
        between consecutive pages.
    """
    # Open the document in a context manager so the underlying file
    # handle is released even if text extraction raises.
    with fitz.open(pdf_path) as pdf:
        page_texts = [page.get_text("text") for page in pdf]
    # Separate pages with a real newline ("\n"); the previous "/n" inserted
    # the two literal characters "/" and "n" between pages.
    return "\n".join(page_texts)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: extract the text of the PDF at the hard-coded
    # path below and print it to stdout.
    print(extract_pfd_text("example/example.pdf"))
|
||||
Reference in New Issue
Block a user