# 03Rag/rag/save.py

from typing import Optional
import logging
import os


from langchain_text_splitters import RecursiveCharacterTextSplitter
from vectorstore import save_text_to_db
from extract_text_auto import extractTextAuto

# Log output format: timestamp, level, then the message.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Default chromadb collection name used when saving.
DEFAULT_COLLECTION_NAME = "rag_system"
# Default chunk size.
DEFAULT_CHUNK_SIZE = 200
# Default chunk overlap.
DEFAULT_CHUNK_OVERLAP = 30


def doc_to_vectorstore(
    file_path: str,
    collection_name: str = DEFAULT_COLLECTION_NAME,
    chunk_size: int = DEFAULT_CHUNK_SIZE,
    chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
) -> int:
    """
    Extract a document's text, split it into chunks, and save each chunk
    to the vector database.

    Args:
        file_path: Path of the file to ingest.
        collection_name: Name of the collection the chunks are saved into.
        chunk_size: Chunk size handed to the text splitter.
        chunk_overlap: Chunk overlap handed to the text splitter.

    Returns:
        The number of chunks that were saved successfully; 0 when no text
        could be extracted from the file.
    """
    # 1. Load the file's text.
    text = extractTextAuto(file_path)
    # Guard against a missing result as well as blank text, so a None from
    # the extractor is reported as an empty file instead of crashing.
    if not text or not text.strip():
        logger.warning(f"文件内容为空:{file_path}")
        return 0

    # 2. Split the text into chunks.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_text(text)
    logger.info(f"文件分块完成，共分为{len(chunks)}块")

    # 3. Save every chunk to the vector database. This is best-effort: a
    #    failure on one chunk is logged and the remaining chunks are still
    #    attempted.
    success_count = 0
    for idx, chunk in enumerate(chunks):
        try:
            save_text_to_db(chunk, collection_name=collection_name)
        except Exception as e:
            logger.error(f"保存第{idx+1}块失败：{str(e)}")
        else:
            success_count += 1
    logger.info(
        f"文件{file_path}已经完成向量化并入库，成功保存{success_count}/{len(chunks)}"
    )
    # Honour the declared ``-> int`` contract on the normal path too.
    return success_count


# Only ingest the sample file when this module is executed as a script, so
# that importing it does not trigger ingestion as a side effect.
if __name__ == "__main__":
    doc_to_vectorstore("西游记.txt")