66 lines
1.9 KiB
Python
66 lines
1.9 KiB
Python
from typing import Optional
|
|
import logging
|
|
import os
|
|
|
|
|
|
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
|
from vectorstore import save_text_to_db
|
|
from extract_text_auto import extractTextAuto
|
|
|
|
# Log output format: timestamp, level name in brackets, then the message.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
# Module-level logger used by the functions in this file.
logger = logging.getLogger(__name__)
|
|
|
|
# Default name of the ChromaDB collection that chunks are saved into.
DEFAULT_COLLECTION_NAME: str = "rag_system"
# Default chunk size handed to the text splitter.
DEFAULT_CHUNK_SIZE: int = 200
# Default overlap between neighbouring chunks handed to the text splitter.
DEFAULT_CHUNK_OVERLAP: int = 30
|
|
|
|
|
|
def doc_to_vectorstore(
    file_path: str,
    collection_name: str = DEFAULT_COLLECTION_NAME,
    chunk_size: int = DEFAULT_CHUNK_SIZE,
    chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
) -> int:
    """Extract a document's text, split it into chunks and save each chunk.

    Args:
        file_path: Path of the file to ingest.
        collection_name: Name of the collection passed to ``save_text_to_db``.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to
            ``RecursiveCharacterTextSplitter``.

    Returns:
        The number of chunks that were saved without raising. ``0`` when the
        extracted text is empty or whitespace-only.
    """
    # 1. Load the file content.
    text = extractTextAuto(file_path)
    # `not text` also covers a None/empty result; the extractor's contract is
    # not visible from this module, so avoid calling .strip() on a non-string.
    if not text or not text.strip():
        logger.warning(f"文件内容为空:{file_path}")
        return 0

    # 2. Split the text into chunks.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = text_splitter.split_text(text)
    logger.info(f"文件分块完成,共分为{len(chunks)}块")

    # 3. Save every chunk. A failing chunk is logged and skipped so that one
    #    bad chunk does not abort the rest of the document (best effort).
    success_count = 0
    for idx, chunk in enumerate(chunks, start=1):
        try:
            save_text_to_db(chunk, collection_name=collection_name)
        except Exception as e:
            logger.error(f"保存第{idx}块失败:{str(e)}")
        else:
            success_count += 1

    logger.info(
        f"文件{file_path}已经完成向量化并入库,成功保存{success_count}/{len(chunks)}"
    )
    # The signature promises an int; report how many chunks were stored.
    return success_count
|
|
|
|
|
|
# Run the sample ingestion only when this file is executed as a script, so
# that importing the module from other code does not start an ingestion run.
if __name__ == "__main__":
    doc_to_vectorstore("西游记.txt")
|