Files
03Rag/rag/save.py
T
heyong.fu a17c65c4bc feat: rag
2026-05-06 11:35:10 +08:00

66 lines
1.9 KiB
Python

from typing import Optional
import logging
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter
from vectorstore import save_text_to_db
from extract_text_auto import extractTextAuto
# Log output format: timestamp, level, then the message.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Default ChromaDB collection name used when saving.
DEFAULT_COLLECTION_NAME = "rag_system"
# Default chunk size.
DEFAULT_CHUNK_SIZE = 200
# Default overlap between consecutive chunks.
DEFAULT_CHUNK_OVERLAP = 30
def doc_to_vectorstore(
    file_path: str,
    collection_name: str = DEFAULT_COLLECTION_NAME,
    chunk_size: int = DEFAULT_CHUNK_SIZE,
    chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
) -> int:
    """Extract a document's text, split it into chunks and save each chunk.

    Args:
        file_path: Path of the file to load.
        collection_name: Collection name forwarded to ``save_text_to_db``.
        chunk_size: Chunk size passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Chunk overlap passed to the splitter.

    Returns:
        The number of chunks for which ``save_text_to_db`` did not raise;
        ``0`` when the extracted text is empty or whitespace-only.
    """
    # 1. Load the file content.
    text = extractTextAuto(file_path)
    # NOTE(review): extractTextAuto's return contract is not visible here;
    # a falsy result (e.g. None) is treated the same as blank text instead of
    # crashing on ``.strip()``.
    if not text or not text.strip():
        logger.warning(f"文件内容为空:{file_path}")
        return 0

    # 2. Split the text into chunks.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_text(text)
    logger.info(f"文件分块完成,共分为{len(chunks)}")

    # 3. Save every chunk to the vector database. A failing chunk is logged
    #    and skipped so one bad chunk does not abort the whole document.
    success_count = 0
    for idx, chunk in enumerate(chunks):
        try:
            save_text_to_db(chunk, collection_name=collection_name)
        except Exception as e:
            logger.error(f"保存第{idx+1}块失败:{str(e)}")
        else:
            success_count += 1

    logger.info(
        f"文件{file_path}已经完成向量化并入库,成功保存{success_count}/{len(chunks)}"
    )
    # Honour the declared ``-> int`` contract on the success path as well.
    return success_count
# Ingest the sample document. This is a top-level statement, so it executes
# whenever the module is run or imported.
# NOTE(review): consider an ``if __name__ == "__main__":`` guard if importing
# this module should not trigger ingestion.
doc_to_vectorstore(file_path="西游记.txt")