feat: rag
This commit is contained in:
+65
@@ -0,0 +1,65 @@
|
||||
from typing import Optional
|
||||
import logging
|
||||
import os
|
||||
|
||||
|
||||
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
||||
from vectorstore import save_text_to_db
|
||||
from extract_text_auto import extractTextAuto
|
||||
|
||||
# Log record layout: timestamp, bracketed level name, then the message.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Name of the ChromaDB collection used when the caller does not supply one.
DEFAULT_COLLECTION_NAME = "rag_system"
# Default chunk size handed to the text splitter.
DEFAULT_CHUNK_SIZE = 200
# Default overlap between consecutive chunks.
DEFAULT_CHUNK_OVERLAP = 30
|
||||
|
||||
|
||||
def doc_to_vectorstore(
    file_path: str,
    collection_name: str = DEFAULT_COLLECTION_NAME,
    chunk_size: int = DEFAULT_CHUNK_SIZE,
    chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
) -> int:
    """Split a document into chunks and save each chunk to the vector store.

    The text is obtained with ``extractTextAuto(file_path)``, split with a
    ``RecursiveCharacterTextSplitter`` and every chunk is passed to
    ``save_text_to_db``. A chunk that fails to save is logged and skipped so
    that one bad chunk does not abort the rest of the document.

    Args:
        file_path: Path of the file to ingest.
        collection_name: Name of the target collection.
        chunk_size: Chunk size passed to the splitter.
        chunk_overlap: Overlap between chunks passed to the splitter.

    Returns:
        The number of chunks saved successfully; 0 when the extracted text
        is empty or whitespace-only.
    """
    # 1. Load the file contents.
    text = extractTextAuto(file_path)
    # Treat a missing (None) result the same as blank text instead of
    # crashing on ``None.strip()``.
    if not text or not text.strip():
        logger.warning(f"文件内容为空:{file_path}")
        return 0

    # 2. Split the text into chunks.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_text(text)
    logger.info(f"文件分块完成,共分为{len(chunks)}块")

    # 3. Save every chunk to the vector database.
    success_count = 0
    for idx, chunk in enumerate(chunks):
        try:
            save_text_to_db(chunk, collection_name=collection_name)
            success_count += 1
        except Exception as e:
            # Deliberately best-effort: record the failure and continue with
            # the remaining chunks.
            logger.error(f"保存第{idx+1}块失败:{str(e)}")
    logger.info(
        f"文件{file_path}已经完成向量化并入库,成功保存{success_count}/{len(chunks)}"
    )
    # The signature promises an int; report how many chunks were stored.
    return success_count
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run the sample ingestion only when this file is executed as a script,
    # so that importing the module does not trigger it.
    doc_to_vectorstore("西游记.txt")
|
||||
Reference in New Issue
Block a user